summaryrefslogtreecommitdiff
path: root/usr/src/cmd/bhyve
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/cmd/bhyve')
-rw-r--r--usr/src/cmd/bhyve/Makefile167
-rw-r--r--usr/src/cmd/bhyve/acpi.c983
-rw-r--r--usr/src/cmd/bhyve/acpi.h56
-rw-r--r--usr/src/cmd/bhyve/ahci.h324
-rw-r--r--usr/src/cmd/bhyve/atkbdc.c586
-rw-r--r--usr/src/cmd/bhyve/atkbdc.h38
-rw-r--r--usr/src/cmd/bhyve/bhyve_sol_glue.c39
-rw-r--r--usr/src/cmd/bhyve/bhyvegc.c103
-rw-r--r--usr/src/cmd/bhyve/bhyvegc.h48
-rw-r--r--usr/src/cmd/bhyve/bhyverun.c1395
-rw-r--r--usr/src/cmd/bhyve/bhyverun.h74
-rw-r--r--usr/src/cmd/bhyve/block_if.c1028
-rw-r--r--usr/src/cmd/bhyve/block_if.h84
-rw-r--r--usr/src/cmd/bhyve/bootrom.c113
-rw-r--r--usr/src/cmd/bhyve/bootrom.h40
-rw-r--r--usr/src/cmd/bhyve/console.c120
-rw-r--r--usr/src/cmd/bhyve/console.h55
-rw-r--r--usr/src/cmd/bhyve/consport.c180
-rw-r--r--usr/src/cmd/bhyve/dbgport.c180
-rw-r--r--usr/src/cmd/bhyve/dbgport.h36
-rw-r--r--usr/src/cmd/bhyve/fwctl.c552
-rw-r--r--usr/src/cmd/bhyve/fwctl.h56
-rw-r--r--usr/src/cmd/bhyve/gdb.c1332
-rw-r--r--usr/src/cmd/bhyve/gdb.h39
-rw-r--r--usr/src/cmd/bhyve/inout.c299
-rw-r--r--usr/src/cmd/bhyve/inout.h93
-rw-r--r--usr/src/cmd/bhyve/ioapic.c83
-rw-r--r--usr/src/cmd/bhyve/ioapic.h43
-rw-r--r--usr/src/cmd/bhyve/iov.c148
-rw-r--r--usr/src/cmd/bhyve/iov.h44
-rw-r--r--usr/src/cmd/bhyve/mem.c361
-rw-r--r--usr/src/cmd/bhyve/mem.h65
-rw-r--r--usr/src/cmd/bhyve/mevent.c680
-rw-r--r--usr/src/cmd/bhyve/mevent.h53
-rw-r--r--usr/src/cmd/bhyve/mevent_test.c282
-rw-r--r--usr/src/cmd/bhyve/mptbl.c379
-rw-r--r--usr/src/cmd/bhyve/mptbl.h37
-rw-r--r--usr/src/cmd/bhyve/pci_ahci.c2485
-rw-r--r--usr/src/cmd/bhyve/pci_e82545.c2418
-rw-r--r--usr/src/cmd/bhyve/pci_emul.c2141
-rw-r--r--usr/src/cmd/bhyve/pci_emul.h298
-rw-r--r--usr/src/cmd/bhyve/pci_fbuf.c467
-rw-r--r--usr/src/cmd/bhyve/pci_hostbridge.c236
-rw-r--r--usr/src/cmd/bhyve/pci_irq.c354
-rw-r--r--usr/src/cmd/bhyve/pci_irq.h47
-rw-r--r--usr/src/cmd/bhyve/pci_lpc.c481
-rw-r--r--usr/src/cmd/bhyve/pci_lpc.h76
-rw-r--r--usr/src/cmd/bhyve/pci_nvme.c1897
-rw-r--r--usr/src/cmd/bhyve/pci_passthru.c910
-rw-r--r--usr/src/cmd/bhyve/pci_uart.c121
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_block.c485
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_console.c701
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_net.c1169
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_rnd.c209
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_scsi.c737
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_viona.c837
-rw-r--r--usr/src/cmd/bhyve/pci_xhci.c2862
-rw-r--r--usr/src/cmd/bhyve/pci_xhci.h355
-rw-r--r--usr/src/cmd/bhyve/pm.c378
-rw-r--r--usr/src/cmd/bhyve/post.c55
-rw-r--r--usr/src/cmd/bhyve/ps2kbd.c383
-rw-r--r--usr/src/cmd/bhyve/ps2kbd.h41
-rw-r--r--usr/src/cmd/bhyve/ps2mouse.c418
-rw-r--r--usr/src/cmd/bhyve/ps2mouse.h43
-rw-r--r--usr/src/cmd/bhyve/rfb.c1148
-rw-r--r--usr/src/cmd/bhyve/rfb.h42
-rw-r--r--usr/src/cmd/bhyve/rtc.c131
-rw-r--r--usr/src/cmd/bhyve/rtc.h36
-rw-r--r--usr/src/cmd/bhyve/smbiostbl.c907
-rw-r--r--usr/src/cmd/bhyve/smbiostbl.h43
-rw-r--r--usr/src/cmd/bhyve/sockstream.c86
-rw-r--r--usr/src/cmd/bhyve/sockstream.h35
-rw-r--r--usr/src/cmd/bhyve/spinup_ap.c110
-rw-r--r--usr/src/cmd/bhyve/spinup_ap.h36
-rw-r--r--usr/src/cmd/bhyve/task_switch.c941
-rw-r--r--usr/src/cmd/bhyve/test/Makefile18
-rw-r--r--usr/src/cmd/bhyve/test/Makefile.com60
-rw-r--r--usr/src/cmd/bhyve/test/Makefile.subdirs29
-rw-r--r--usr/src/cmd/bhyve/test/Makefile.targ55
-rw-r--r--usr/src/cmd/bhyve/test/scripts/Makefile28
-rw-r--r--usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh231
-rw-r--r--usr/src/cmd/bhyve/test/tst/Makefile18
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/Makefile30
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c172
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/mevent.c57
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/read.disable.c163
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/read.pause.c152
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c108
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/testlib.c69
-rw-r--r--usr/src/cmd/bhyve/test/tst/mevent/testlib.h88
-rw-r--r--usr/src/cmd/bhyve/uart_emul.c955
-rw-r--r--usr/src/cmd/bhyve/uart_emul.h47
-rw-r--r--usr/src/cmd/bhyve/usb_emul.c78
-rw-r--r--usr/src/cmd/bhyve/usb_emul.h164
-rw-r--r--usr/src/cmd/bhyve/usb_mouse.c802
-rw-r--r--usr/src/cmd/bhyve/vga.c1357
-rw-r--r--usr/src/cmd/bhyve/vga.h162
-rw-r--r--usr/src/cmd/bhyve/virtio.c794
-rw-r--r--usr/src/cmd/bhyve/virtio.h484
-rw-r--r--usr/src/cmd/bhyve/xmsr.c239
-rw-r--r--usr/src/cmd/bhyve/xmsr.h38
-rw-r--r--usr/src/cmd/bhyve/zhyve.c167
102 files changed, 40879 insertions, 0 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
new file mode 100644
index 0000000000..0ad066e6d4
--- /dev/null
+++ b/usr/src/cmd/bhyve/Makefile
@@ -0,0 +1,167 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Pluribus Networks Inc.
+# Copyright (c) 2018, Joyent, Inc.
+#
+
+# Name of the primary binary produced by this Makefile.
+PROG = bhyve
+
+# Shared build definitions pulled in from the parent cmd directory; their
+# contents are not part of this directory.
+include ../Makefile.cmd
+include ../Makefile.cmd.64
+include ../Makefile.ctf
+
+# Sub-directories that the top-level targets below also recurse into.
+SUBDIRS = test
+
+# Per-target value of TARGET; it is handed to the sub-make that the
+# $(SUBDIRS) rule runs in each sub-directory.
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+# C sources whose objects are linked into $(PROG) (see OBJS below).
+# vmm_instruction_emul.c is not among the files added in this directory; it is
+# presumably picked up through the $(SRC)/uts/i86pc/io/vmm pattern rule at the
+# end of this Makefile.
+SRCS = acpi.c \
+ atkbdc.c \
+ bhyvegc.c \
+ bhyverun.c \
+ block_if.c \
+ bootrom.c \
+ console.c \
+ consport.c \
+ dbgport.c \
+ fwctl.c \
+ gdb.c \
+ inout.c \
+ ioapic.c \
+ mem.c \
+ mevent.c \
+ mptbl.c \
+ pci_ahci.c \
+ pci_e82545.c \
+ pci_emul.c \
+ pci_fbuf.c \
+ pci_hostbridge.c \
+ pci_irq.c \
+ pci_lpc.c \
+ pci_nvme.c \
+ pci_passthru.c \
+ pci_uart.c \
+ pci_virtio_block.c \
+ pci_virtio_console.c \
+ pci_virtio_net.c \
+ pci_virtio_rnd.c \
+ pci_virtio_viona.c \
+ pci_xhci.c \
+ pm.c \
+ post.c \
+ ps2kbd.c \
+ ps2mouse.c \
+ rfb.c \
+ rtc.c \
+ smbiostbl.c \
+ sockstream.c \
+ task_switch.c \
+ uart_emul.c \
+ usb_emul.c \
+ usb_mouse.c \
+ vga.c \
+ virtio.c \
+ vmm_instruction_emul.c \
+ xmsr.c \
+ spinup_ap.c \
+ iov.c \
+ bhyve_sol_glue.c
+
+# The virtio-scsi driver appears to include a slew of materials from FreeBSD's
+# native SCSI implementation. We will omit that complexity for now.
+ #ctl_util.c \
+ #ctl_scsi_all.c \
+ #pci_virtio_scsi.c \
+
+
+# One object per entry in SRCS.
+OBJS = $(SRCS:.c=.o)
+
+# Installed artifacts removed by the clobber target.
+CLOBBERFILES = $(ROOTUSRSBINPROG) $(ZHYVE)
+
+# zhyve is a second program; it is installed under the bhyve brand directory
+# rather than alongside $(PROG).
+ZHYVE_DIR = $(ROOT)/usr/lib/brand/bhyve
+ZHYVE_PROG = zhyve
+ZHYVE = $(ZHYVE_DIR)/$(ZHYVE_PROG)
+
+# Stand-alone test program built from mevent.c plus its test driver.
+MEVENT_TEST_PROG = mevent_test
+MEVENT_TEST_SRCS = mevent.c mevent_test.c
+MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o)
+
+# Locally built binaries and test objects removed by the clean target.
+CLEANFILES = $(PROG) $(ZHYVE_PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS)
+
+CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses
+# CPPFLAGS is replaced rather than appended to, so that the FreeBSD compat and
+# contrib include directories are listed ahead of $(CPPFLAGS.master).
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
+ -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \
+ -I$(CONTRIB)/freebsd/dev/usb/controller \
+ -I$(CONTRIB)/freebsd/dev/mii \
+ -I$(SRC)/uts/common/io/e1000api \
+ $(CPPFLAGS.master) \
+ -I$(ROOT)/usr/platform/i86pc/include \
+ -I$(SRC)/uts/i86pc/io/vmm \
+ -I$(SRC)/uts/common \
+ -I$(SRC)/uts/i86pc \
+ -I$(SRC)/lib/libdladm/common \
+ -DWITHOUT_CAPSICUM
+
+# Disable the crypto code until it is wired up
+CPPFLAGS += -DNO_OPENSSL
+
+# Extra warning suppression applied only when compiling pci_nvme.o.
+pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign
+
+# Force c99 for everything
+CSTD= $(CSTD_GNU99)
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+# Per-program link libraries.
+$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz
+$(ZHYVE_PROG) := LDLIBS += -lnvpair
+$(MEVENT_TEST_PROG) := LDLIBS += -lsocket
+
+# Run $(GENSETDEFS) on the output as an additional post-processing step.
+POST_PROCESS += ; $(GENSETDEFS) $@
+
+.KEEP_STATE:
+
+# Default build: the main binary, the mevent test program, zhyve, and then
+# the sub-directories.
+all: $(PROG) $(MEVENT_TEST_PROG) $(ZHYVE_PROG) $(SUBDIRS)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
+ $(POST_PROCESS)
+
+# The test program is linked without the post-processing step.
+$(MEVENT_TEST_PROG): $(MEVENT_TEST_OBJS)
+ $(LINK.c) -o $@ $(MEVENT_TEST_OBJS) $(LDFLAGS) $(LDLIBS)
+
+install: all $(ZHYVE) $(ROOTUSRSBINPROG) $(SUBDIRS)
+
+clean: $(SUBDIRS)
+ $(RM) $(OBJS) $(CLEANFILES)
+
+clobber: clean $(SUBDIRS)
+ $(RM) $(CLOBBERFILES)
+
+lint: lint_SRCS $(SUBDIRS)
+
+# Recurse into each sub-directory with the TARGET chosen by the conditional
+# assignments near the top of this file. FRC forces the rule to always run.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
+
+# Install a locally built file into the brand directory.
+$(ZHYVE_DIR)/%: %
+ $(INS.file)
+
+# Compile objects whose source lives in the i86pc vmm kernel directory rather
+# than in this directory.
+%.o: $(SRC)/uts/i86pc/io/vmm/%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c
new file mode 100644
index 0000000000..309ba98a11
--- /dev/null
+++ b/usr/src/cmd/bhyve/acpi.c
@@ -0,0 +1,983 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * bhyve ACPI table generator.
+ *
+ * Create the minimal set of ACPI tables required to boot FreeBSD (and
+ * hopefully other o/s's) by writing out ASL template files for each of
+ * the tables and then compiling them to AML with the Intel iasl compiler.
+ * The AML files are then read into guest memory.
+ *
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout
+ * ------
+ * RSDP -> 0xf2400 (36 bytes fixed)
+ * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
+ * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
+ * MADT -> 0xf2500 (depends on #CPUs)
+ * FADT -> 0xf2600 (268 bytes)
+ * HPET -> 0xf2740 (56 bytes)
+ * MCFG -> 0xf2780 (60 bytes)
+ * FACS -> 0xf27C0 (64 bytes)
+ * DSDT -> 0xf2800 (variable - can go up to 0x100000)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+
+#include <paths.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "pci_emul.h"
+
+/*
+ * Define the base address of the ACPI tables, and the offsets to
+ * the individual tables.
+ *
+ * Each *_OFFSET is added to the base address to form the guest-physical
+ * address of that table; the RSDP has no offset macro because, per the layout
+ * in the file header, it sits at the base address itself.
+ */
+#define BHYVE_ACPI_BASE 0xf2400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+#define FADT_OFFSET 0x200
+#define HPET_OFFSET 0x340
+#define MCFG_OFFSET 0x380
+#define FACS_OFFSET 0x3C0
+#define DSDT_OFFSET 0x400
+
+/*
+ * Temporary-file name template, output-file suffix and compiler path.
+ * NOTE(review): the code that uses these is outside this excerpt; presumably
+ * they drive the iasl invocation described in the file header - confirm.
+ */
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+/*
+ * NOTE(review): basl_keep_temps and basl_verbose_iasl are not referenced in
+ * the visible part of this file; judging by their names they control whether
+ * temporary files are kept and how chatty iasl is - confirm at the use sites.
+ */
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+/* Number of Processor Local APIC entries written into the MADT. */
+static int basl_ncpu;
+/* Base address that every table offset is added to. */
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+/* Value written as the HPET table's "Timer Block ID" field. */
+static uint32_t hpet_capabilities;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp(3)/mkstemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+/*
+ * State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
+ *
+ * dsdt_fp is the stream the DSDT text is written to, dsdt_indent_level is the
+ * current nesting depth (each level is emitted as two columns), and
+ * dsdt_error holds the errno of the first failed write, or 0 if none.
+ */
+static FILE *dsdt_fp;
+static int dsdt_indent_level;
+static int dsdt_error;
+
+/*
+ * Bundle of a file descriptor, a stdio stream and a path name.
+ * NOTE(review): the users of this struct are outside this excerpt; presumably
+ * all three fields refer to the same temporary file - confirm.
+ */
+struct basl_fio {
+ int fd;
+ FILE *fp;
+ char f_name[MAXPATHLEN];
+};
+
+/*
+ * Error-checked stdio wrappers.
+ *
+ * EFPRINTF() forwards its arguments to fprintf() and EFFLUSH() calls fflush()
+ * on the given stream; on failure either one jumps to a label named err_exit,
+ * which the calling function must provide.
+ *
+ * The bodies are wrapped in do { } while (0) so that each macro expands to
+ * exactly one statement. Callers supply the terminating semicolon, and the
+ * macros can be used as the unbraced body of an if/else without the expansion
+ * swallowing the else or leaving a stray empty statement behind.
+ */
+#define EFPRINTF(...) \
+	do { \
+		if (fprintf(__VA_ARGS__) < 0) \
+			goto err_exit; \
+	} while (0)
+
+#define EFFLUSH(x) \
+	do { \
+		if (fflush(x) != 0) \
+			goto err_exit; \
+	} while (0)
+
+/*
+ * Write the RSDP template to fp.
+ *
+ * The table is emitted as one text line per field. The RSDT and XSDT
+ * addresses are derived from basl_acpi_base; every other field is a fixed
+ * literal.
+ *
+ * Returns 0 on success. If any fprintf() or the final fflush() fails, the
+ * EFPRINTF/EFFLUSH macros jump to err_exit and the current errno is returned.
+ */
+static int
+basl_fwrite_rsdp(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDP template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
+ EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
+ basl_acpi_base + RSDT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
+ /* 8-byte field: the upper 32 bits are written as literal zeros */
+ EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
+ basl_acpi_base + XSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the RSDT template to fp.
+ *
+ * Emits the common table header followed by four 32-bit table addresses
+ * (MADT, FADT, HPET, MCFG), each computed as basl_acpi_base plus the
+ * table's offset.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_rsdt(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT, FADT, HPET and MCFG */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
+ basl_acpi_base + FADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
+ basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
+ basl_acpi_base + MCFG_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the XSDT template to fp.
+ *
+ * Same shape as the RSDT, but each of the four table addresses (MADT, FADT,
+ * HPET, MCFG) is written as 16 hex digits: eight literal zeros followed by
+ * basl_acpi_base plus the table's offset.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_xsdt(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve XSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /*
+ * Add in pointers to the MADT, FADT, HPET and MCFG.
+ * NOTE(review): the values are 64 bits wide but the bracketed label
+ * still reads [0004]; presumably iasl ignores that label - confirm.
+ */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
+ basl_acpi_base + FADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
+ basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
+ basl_acpi_base + MCFG_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the MADT ("APIC") template to fp.
+ *
+ * After the common header this emits, in order: one Processor Local APIC
+ * subtable per CPU (basl_ncpu of them), a single I/O APIC subtable, two
+ * interrupt source overrides (IRQ0 and SCI_INT), and one Local APIC NMI
+ * subtable.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_madt(FILE *fp)
+{
+ int i;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
+ EFPRINTF(fp, "\n");
+
+ /*
+ * Add a Processor Local APIC entry for each CPU; the processor ID and
+ * the local APIC ID are both set to the loop index.
+ */
+ for (i = 0; i < basl_ncpu; i++) {
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 08\n");
+ /* iasl expects hex values for the proc and apic id's */
+ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i);
+ EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i);
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
+ EFPRINTF(fp, "\n");
+ }
+
+ /* Always a single IOAPIC entry, with ID 0 */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
+ /* iasl expects a hex value for the i/o apic id */
+ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0);
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
+ EFPRINTF(fp, "[0001]\t\tBus : 00\n");
+ EFPRINTF(fp, "[0001]\t\tSource : 00\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n");
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 1\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n");
+ EFPRINTF(fp, "\n");
+
+ /* Second override: SCI_INT is used as both source and interrupt */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
+ EFPRINTF(fp, "[0001]\t\tBus : 00\n");
+ EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT);
+ EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT);
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 3\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n");
+ EFPRINTF(fp, "\n");
+
+ /* Local APIC NMI is connected to LINT 1 on all CPUs */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 06\n");
+ EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n");
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 1\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n");
+ EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n");
+ EFPRINTF(fp, "\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the FADT ("FACP") template to fp.
+ *
+ * The table is emitted field by field. Values that depend on the build or
+ * on runtime state are substituted in: the FACS and DSDT addresses (from
+ * basl_acpi_base), SCI_INT, SMI_CMD, the ACPI enable/disable values, the
+ * PM1A event/control block addresses and the PM timer port. Each of the
+ * address-bearing fields appears twice: once as a 32-bit value in the first
+ * part of the table and once inside a Generic Address Structure (or as an
+ * 8-byte field) further down.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_fadt(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* 32-bit FACS/DSDT pointers and the fixed-hardware port numbers */
+ EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tModel : 01\n");
+ EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
+ EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n",
+ SCI_INT);
+ EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n",
+ SMI_CMD);
+ EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n",
+ BHYVE_ACPI_ENABLE);
+ EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n",
+ BHYVE_ACPI_DISABLE);
+ EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
+ EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n",
+ PM1A_EVT_ADDR);
+ EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n",
+ PM1A_CNT_ADDR);
+ EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
+ IO_PMTMR);
+ EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
+ EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
+ EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n");
+ /*
+ * Flag words are written as zero and followed by one line per
+ * decoded bit.
+ */
+ EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n");
+ EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
+ EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
+ EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
+ EFPRINTF(fp, "\n");
+
+ /* Reset register: I/O port 0xCF9, reset value 06 */
+ EFPRINTF(fp,
+ "[0012]\t\tReset Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n");
+ EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n");
+ EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n");
+ EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n");
+ EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n");
+ /* 8-byte FACS/DSDT pointers: upper 32 bits are literal zeros */
+ EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ /* Generic Address Structure forms of the blocks listed above */
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ PM1A_EVT_ADDR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ PM1A_CNT_ADDR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Valid for bhyve */
+ EFPRINTF(fp,
+ "[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ IO_PMTMR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the HPET template to fp.
+ *
+ * The "Timer Block ID" field is taken from hpet_capabilities; the timer
+ * block register address is the fixed literal FED00000 in system memory
+ * space. Everything else is a fixed literal.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_hpet(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve HPET template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities);
+ EFPRINTF(fp,
+ "[0012]\t\tTimer Block Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ /* Same base address as the HPET device's memory range in the DSDT */
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n");
+ EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n");
+ EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the MCFG template to fp.
+ *
+ * Emits the common header followed by a single entry whose base address is
+ * the value returned by pci_ecfg_base(), covering segment group 0000 and
+ * buses 00 through FF.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_mcfg(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MCFG template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
+ EFPRINTF(fp, "\n");
+
+ /*
+ * NOTE(review): %016lX requires pci_ecfg_base() to return a type the
+ * size of unsigned long; its declaration is not visible here -
+ * confirm against pci_emul.h.
+ */
+ EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
+ EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
+ EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
+ EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
+ EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
+ EFFLUSH(fp);
+ return (0);
+err_exit:
+ return (errno);
+}
+
+/*
+ * Write the FACS template to fp.
+ *
+ * Every field is a fixed literal: a 0x40-byte table, version 02, with the
+ * hardware signature, both waking vectors, the global lock and all flags
+ * written as zero.
+ *
+ * Returns 0 on success, or the current errno if a write or the final flush
+ * fails (the EFPRINTF/EFFLUSH macros jump to err_exit).
+ */
+static int
+basl_fwrite_facs(FILE *fp)
+{
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FACS template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
+ EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
+ EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
+ EFPRINTF(fp,
+ "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
+ EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Helper routines for writing to the DSDT from other modules.
+ */
+/*
+ * Emit one line of DSDT source to dsdt_fp.
+ *
+ * A non-empty format is preceded by two columns of padding per indent level
+ * and then expanded printf-style; an empty format produces a bare newline.
+ * The first write failure is latched in dsdt_error, after which every later
+ * call returns without writing anything.
+ */
+void
+dsdt_line(const char *fmt, ...)
+{
+	va_list args;
+	int rc;
+
+	/* A previously recorded failure is sticky. */
+	if (dsdt_error != 0)
+		return;
+
+	if (fmt[0] != '\0') {
+		if (dsdt_indent_level != 0) {
+			EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' ');
+		}
+
+		va_start(args, fmt);
+		rc = vfprintf(dsdt_fp, fmt, args);
+		va_end(args);
+		if (rc < 0)
+			goto err_exit;
+	}
+
+	EFPRINTF(dsdt_fp, "\n");
+	return;
+
+err_exit:
+	dsdt_error = errno;
+}
+
+/*
+ * Increase the DSDT indent level by 'levels'.  The resulting level is
+ * asserted to be non-negative.
+ */
+void
+dsdt_indent(int levels)
+{
+	dsdt_indent_level = dsdt_indent_level + levels;
+	assert(!(dsdt_indent_level < 0));
+}
+
+/*
+ * Decrease the DSDT indent level by 'levels'.  Asserts beforehand that
+ * the current level is at least 'levels'.
+ */
+void
+dsdt_unindent(int levels)
+{
+	assert(!(dsdt_indent_level < levels));
+	dsdt_indent_level = dsdt_indent_level - levels;
+}
+
+/*
+ * Emit an ASL "IO (Decode16, ...)" resource descriptor via dsdt_line().
+ * iobase is written as both the range minimum and the range maximum,
+ * the alignment is the literal 0x01, and length fills the length field.
+ */
+void
+dsdt_fixed_ioport(uint16_t iobase, uint16_t length)
+{
+
+ dsdt_line("IO (Decode16,");
+ dsdt_line(" 0x%04X, // Range Minimum", iobase);
+ dsdt_line(" 0x%04X, // Range Maximum", iobase);
+ dsdt_line(" 0x01, // Alignment");
+ dsdt_line(" 0x%02X, // Length", length);
+ dsdt_line(" )");
+}
+
+/*
+ * Emit an ASL "IRQNoFlags ()" resource descriptor listing the single
+ * interrupt number irq, via dsdt_line().
+ */
+void
+dsdt_fixed_irq(uint8_t irq)
+{
+
+ dsdt_line("IRQNoFlags ()");
+ dsdt_line(" {%d}", irq);
+}
+
+/*
+ * Emit an ASL "Memory32Fixed (ReadWrite, ...)" resource descriptor with
+ * the given base address and length, via dsdt_line().
+ */
+void
+dsdt_fixed_mem32(uint32_t base, uint32_t length)
+{
+
+ dsdt_line("Memory32Fixed (ReadWrite,");
+ dsdt_line(" 0x%08X, // Address Base", base);
+ dsdt_line(" 0x%08X, // Address Length", length);
+ dsdt_line(" )");
+}
+
+/*
+ * Write the DSDT ASL source to fp.
+ *
+ * Resets the dsdt_line() state (output stream, sticky error, indent
+ * level), emits the DefinitionBlock header and the _S5 package, calls
+ * pci_write_dsdt(), and then emits an HPET device under _SB.PC00.
+ *
+ * Returns 0 on success, the sticky dsdt_error if any dsdt_line() call
+ * failed, or errno when control reaches err_exit.
+ * NOTE(review): EFFLUSH is defined outside this view; it presumably
+ * jumps to err_exit when the flush fails.
+ */
+static int
+basl_fwrite_dsdt(FILE *fp)
+{
+ dsdt_fp = fp;
+ dsdt_error = 0;
+ dsdt_indent_level = 0;
+
+ dsdt_line("/*");
+ dsdt_line(" * bhyve DSDT template");
+ dsdt_line(" */");
+ dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
+ "\"BHYVE \", \"BVDSDT \", 0x00000001)");
+ dsdt_line("{");
+ dsdt_line(" Name (_S5, Package ()");
+ dsdt_line(" {");
+ dsdt_line(" 0x05,");
+ dsdt_line(" Zero,");
+ dsdt_line(" })");
+
+ pci_write_dsdt();
+
+ dsdt_line("");
+ dsdt_line(" Scope (_SB.PC00)");
+ dsdt_line(" {");
+ dsdt_line(" Device (HPET)");
+ dsdt_line(" {");
+ dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))");
+ dsdt_line(" Name (_UID, 0)");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(4);
+ dsdt_fixed_mem32(0xFED00000, 0x400);
+ dsdt_unindent(4);
+ dsdt_line(" })");
+ dsdt_line(" }");
+ dsdt_line(" }");
+ dsdt_line("}");
+
+ /* dsdt_line() records the first failure instead of returning it. */
+ if (dsdt_error != 0)
+ return (dsdt_error);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+/*
+ * Create a temporary file and attach a read/write stdio stream to it.
+ *
+ * bf:     receives the generated file name, the descriptor and the stream.
+ * suffix: non-zero to build the name from basl_stemplate (which carries
+ *         the BHYVE_ASL_SUFFIX tail), zero to use basl_template.
+ *
+ * Returns 0 on success and 1 on failure.  On failure nothing is left
+ * open and bf->fp must not be used.
+ */
+static int
+basl_open(struct basl_fio *bf, int suffix)
+{
+	if (suffix) {
+		strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN);
+		bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
+	} else {
+		strlcpy(bf->f_name, basl_template, MAXPATHLEN);
+		bf->fd = mkstemp(bf->f_name);
+	}
+
+	/* Failure is reported as -1; descriptor 0 is a valid result. */
+	if (bf->fd < 0)
+		return (1);
+
+	bf->fp = fdopen(bf->fd, "w+");
+	if (bf->fp == NULL) {
+		/*
+		 * Undo the file creation and report the failure, so the
+		 * caller never proceeds with a NULL stream.
+		 */
+		unlink(bf->f_name);
+		close(bf->fd);
+		bf->fd = -1;
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Release a temporary file opened by basl_open(): remove it from the
+ * file system unless basl_keep_temps is set, then close its stream.
+ */
+static void
+basl_close(struct basl_fio *bf)
+{
+	if (basl_keep_temps == 0)
+		unlink(bf->f_name);
+
+	fclose(bf->fp);
+}
+
+/*
+ * Open the pair of temporary files used for one compilation: 'in'
+ * without the suffix and 'out' with it.  If the second open fails the
+ * first file is closed again.  Returns 0 on success, otherwise the
+ * error reported by basl_open().
+ */
+static int
+basl_start(struct basl_fio *in, struct basl_fio *out)
+{
+	int rc;
+
+	rc = basl_open(in, 0);
+	if (rc)
+		return (rc);
+
+	rc = basl_open(out, 1);
+	if (rc)
+		basl_close(in);
+
+	return (rc);
+}
+
+/*
+ * Counterpart of basl_start(): close both temporary files of a
+ * compilation via basl_close().
+ */
+static void
+basl_end(struct basl_fio *in, struct basl_fio *out)
+{
+
+ basl_close(in);
+ basl_close(out);
+}
+
+/*
+ * Copy the whole file behind fd into guest memory at
+ * basl_acpi_base + off.
+ *
+ * The file size comes from fstat(2); the destination is obtained from
+ * paddr_guest2host() for exactly that many bytes.
+ *
+ * Returns 0 on success, errno if fstat/read fail, EFAULT if the guest
+ * range cannot be mapped, and EIO if the file ends before the size
+ * reported by fstat has been read.
+ */
+static int
+basl_load(struct vmctx *ctx, int fd, uint64_t off)
+{
+	struct stat sb;
+	void *gaddr;
+	char *dst;
+	size_t remaining;
+	ssize_t n;
+
+	if (fstat(fd, &sb) < 0)
+		return (errno);
+
+	gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size);
+	if (gaddr == NULL)
+		return (EFAULT);
+
+	/*
+	 * read(2) may return fewer bytes than requested; keep going until
+	 * the whole table is in place rather than accepting a partial copy.
+	 */
+	dst = gaddr;
+	remaining = (size_t)sb.st_size;
+	while (remaining > 0) {
+		n = read(fd, dst, remaining);
+		if (n < 0) {
+			if (errno == EINTR)
+				continue;
+			return (errno);
+		}
+		if (n == 0)
+			return (EIO);	/* file shorter than st_size */
+		dst += n;
+		remaining -= (size_t)n;
+	}
+
+	return (0);
+}
+
+/*
+ * Generate one ACPI table and load it into guest memory.
+ *
+ * ctx:            VM context handed on to basl_load().
+ * fwrite_section: callback that writes the ASL source for the table to
+ *                 the supplied stream; returns 0 on success.
+ * offset:         offset handed on to basl_load() for the compiled output.
+ *
+ * The source is written to a temporary file, BHYVE_ASL_COMPILER is run
+ * on it through system(3), and the output file is copied into the guest.
+ *
+ * Returns 0 on success; otherwise the first non-zero result from the
+ * steps above, or E2BIG if the compiler command line does not fit in
+ * the command buffer.
+ */
+static int
+basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset)
+{
+	struct basl_fio io[2];
+	static char iaslbuf[3*MAXPATHLEN + 10];
+	const char *fmt;
+	int err;
+	int len;
+
+	err = basl_start(&io[0], &io[1]);
+	if (err)
+		return (err);
+
+	err = (*fwrite_section)(io[0].fp);
+	if (err)
+		goto done;
+
+	/*
+	 * iasl sends the results of the compilation to
+	 * stdout. Shut this down by using the shell to
+	 * redirect stdout to /dev/null, unless the user
+	 * has requested verbose output for debugging
+	 * purposes
+	 */
+	fmt = basl_verbose_iasl ?
+	    "%s -p %s %s" :
+	    "/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
+
+	len = snprintf(iaslbuf, sizeof(iaslbuf), fmt,
+	    BHYVE_ASL_COMPILER, io[1].f_name, io[0].f_name);
+	if (len < 0 || (size_t)len >= sizeof(iaslbuf)) {
+		/* Never hand a truncated command line to the shell. */
+		err = E2BIG;
+		goto done;
+	}
+
+	err = system(iaslbuf);
+	if (err)
+		goto done;
+
+	/*
+	 * Copy the aml output file into guest
+	 * memory at the specified location
+	 */
+	err = basl_load(ctx, io[1].fd, offset);
+
+done:
+	basl_end(&io[0], &io[1]);
+	return (err);
+}
+
+/*
+ * Build the two mkstemp(3)-style path templates used for temporary ASL
+ * files: basl_template ("<tmpdir>/" BHYVE_ASL_TEMPLATE) and
+ * basl_stemplate (the same path followed by BHYVE_ASL_SUFFIX).
+ *
+ * The directory is taken from $BHYVE_TMPDIR if set and non-empty,
+ * otherwise from $TMPDIR if set and non-empty, otherwise _PATH_TMP.
+ * Trailing slashes on the directory are dropped.
+ *
+ * Returns 0 on success or E2BIG if a resulting path would not fit in
+ * MAXPATHLEN.
+ */
+static int
+basl_make_templates(void)
+{
+	const char *tmpdir;
+	int err;
+	int len;
+
+	err = 0;
+
+	/*
+	 * Each fallback is consulted only when the preferred variable is
+	 * unset or empty; a usable BHYVE_TMPDIR must not be overridden by
+	 * TMPDIR.
+	 */
+	tmpdir = getenv("BHYVE_TMPDIR");
+	if (tmpdir == NULL || *tmpdir == '\0')
+		tmpdir = getenv("TMPDIR");
+	if (tmpdir == NULL || *tmpdir == '\0')
+		tmpdir = _PATH_TMP;
+
+	len = strlen(tmpdir);
+
+	if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+		strcpy(basl_template, tmpdir);
+		while (len > 0 && basl_template[len - 1] == '/')
+			len--;
+		basl_template[len] = '/';
+		strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+	} else
+		err = E2BIG;
+
+	if (!err) {
+		/*
+		 * len has been initialized (and maybe adjusted) above
+		 */
+		if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+		    sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+			strcpy(basl_stemplate, tmpdir);
+			basl_stemplate[len] = '/';
+			strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+			len = strlen(basl_stemplate);
+			strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+		} else
+			err = E2BIG;
+	}
+
+	return (err);
+}
+
+/*
+ * Table of ACPI sections built by acpi_build(): each entry pairs the
+ * routine that writes a section's ASL source with the offset passed to
+ * basl_compile() for it.  Entries are processed in order; the list is
+ * terminated by an entry whose wsect is NULL.
+ */
+static struct {
+ int (*wsect)(FILE *fp);
+ uint64_t offset;
+} basl_ftables[] =
+{
+ { basl_fwrite_rsdp, 0},
+ { basl_fwrite_rsdt, RSDT_OFFSET },
+ { basl_fwrite_xsdt, XSDT_OFFSET },
+ { basl_fwrite_madt, MADT_OFFSET },
+ { basl_fwrite_fadt, FADT_OFFSET },
+ { basl_fwrite_hpet, HPET_OFFSET },
+ { basl_fwrite_mcfg, MCFG_OFFSET },
+ { basl_fwrite_facs, FACS_OFFSET },
+ { basl_fwrite_dsdt, DSDT_OFFSET },
+ { NULL }
+};
+
+/*
+ * Build all ACPI tables for the guest.
+ *
+ * ctx:  VM context used to query HPET capabilities and to load tables.
+ * ncpu: CPU count, recorded in basl_ncpu for the table writers.
+ *
+ * Environment: BHYVE_ACPI_VERBOSE_IASL (if present) keeps the iasl
+ * output on stdout; BHYVE_ACPI_KEEPTMPS (if present) keeps the
+ * generated temporary files.
+ *
+ * Returns 0 on success or the first non-zero error encountered.
+ */
+int
+acpi_build(struct vmctx *ctx, int ncpu)
+{
+	int rc;
+	int idx;
+
+	basl_ncpu = ncpu;
+
+	rc = vm_get_hpet_capabilities(ctx, &hpet_capabilities);
+	if (rc != 0)
+		return (rc);
+
+	/* Debug aid: let the iasl compiler output reach stdout. */
+	if (getenv("BHYVE_ACPI_VERBOSE_IASL") != NULL)
+		basl_verbose_iasl = 1;
+
+	/* Debug aid: leave the generated ASL files in place after use. */
+	if (getenv("BHYVE_ACPI_KEEPTMPS") != NULL)
+		basl_keep_temps = 1;
+
+	rc = basl_make_templates();
+
+	/*
+	 * Compile every section in table order and copy it into guest
+	 * memory, stopping at the first failure.
+	 */
+	for (idx = 0; rc == 0 && basl_ftables[idx].wsect != NULL; idx++) {
+		rc = basl_compile(ctx, basl_ftables[idx].wsect,
+		    basl_ftables[idx].offset);
+	}
+
+	return (rc);
+}
diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h
new file mode 100644
index 0000000000..4c6d86d091
--- /dev/null
+++ b/usr/src/cmd/bhyve/acpi.h
@@ -0,0 +1,56 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACPI_H_
+#define _ACPI_H_
+
+#define SCI_INT 9
+
+#define SMI_CMD 0xb2
+#define BHYVE_ACPI_ENABLE 0xa0
+#define BHYVE_ACPI_DISABLE 0xa1
+
+#define PM1A_EVT_ADDR 0x400
+#define PM1A_CNT_ADDR 0x404
+
+#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
+
+struct vmctx;
+
+/* Build the guest's ACPI tables; returns 0 on success, non-zero on error. */
+int acpi_build(struct vmctx *ctx, int ncpu);
+/*
+ * Helpers for writing to the DSDT from other modules.  dsdt_line() emits
+ * one printf-style formatted line at the current indent level; the
+ * dsdt_fixed_*() routines emit complete resource descriptors; and
+ * dsdt_indent()/dsdt_unindent() adjust the indent level.
+ */
+void dsdt_line(const char *fmt, ...);
+void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
+void dsdt_fixed_irq(uint8_t irq);
+void dsdt_fixed_mem32(uint32_t base, uint32_t length);
+void dsdt_indent(int levels);
+void dsdt_unindent(int levels);
+void sci_init(struct vmctx *ctx);
+
+#endif /* _ACPI_H_ */
diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h
new file mode 100644
index 0000000000..691d4bd438
--- /dev/null
+++ b/usr/src/cmd/bhyve/ahci.h
@@ -0,0 +1,324 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
+ * Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AHCI_H_
+#define _AHCI_H_
+
+/* ATA register defines */
+#define ATA_DATA 0 /* (RW) data */
+
+#define ATA_FEATURE 1 /* (W) feature */
+#define ATA_F_DMA 0x01 /* enable DMA */
+#define ATA_F_OVL 0x02 /* enable overlap */
+
+#define ATA_COUNT 2 /* (W) sector count */
+
+#define ATA_SECTOR 3 /* (RW) sector # */
+#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
+#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
+#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
+#define ATA_D_LBA 0x40 /* use LBA addressing */
+#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
+
+#define ATA_COMMAND 7 /* (W) command */
+
+#define ATA_ERROR 8 /* (R) error */
+#define ATA_E_ILI 0x01 /* illegal length */
+#define ATA_E_NM 0x02 /* no media */
+#define ATA_E_ABORT 0x04 /* command aborted */
+#define ATA_E_MCR 0x08 /* media change request */
+#define ATA_E_IDNF 0x10 /* ID not found */
+#define ATA_E_MC 0x20 /* media changed */
+#define ATA_E_UNC 0x40 /* uncorrectable data */
+#define ATA_E_ICRC 0x80 /* UDMA crc error */
+#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
+
+#define ATA_IREASON 9 /* (R) interrupt reason */
+#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
+#define ATA_I_IN 0x02 /* read (1) | write (0) */
+#define ATA_I_RELEASE 0x04 /* released bus (1) */
+#define ATA_I_TAGMASK 0xf8 /* tag mask */
+
+#define ATA_STATUS 10 /* (R) status */
+#define ATA_ALTSTAT 11 /* (R) alternate status */
+#define ATA_S_ERROR 0x01 /* error */
+#define ATA_S_INDEX 0x02 /* index */
+#define ATA_S_CORR 0x04 /* data corrected */
+#define ATA_S_DRQ 0x08 /* data request */
+#define ATA_S_DSC 0x10 /* drive seek completed */
+#define ATA_S_SERVICE 0x10 /* drive needs service */
+#define ATA_S_DWF 0x20 /* drive write fault */
+#define ATA_S_DMA 0x20 /* DMA ready */
+#define ATA_S_READY 0x40 /* drive ready */
+#define ATA_S_BUSY 0x80 /* busy */
+
+#define ATA_CONTROL 12 /* (W) control */
+#define ATA_A_IDS 0x02 /* disable interrupts */
+#define ATA_A_RESET 0x04 /* RESET controller */
+#define ATA_A_4BIT 0x08 /* 4 head bits */
+#define ATA_A_HOB 0x80 /* High Order Byte enable */
+
+/* SATA register defines */
+#define ATA_SSTATUS 13
+#define ATA_SS_DET_MASK 0x0000000f
+#define ATA_SS_DET_NO_DEVICE 0x00000000
+#define ATA_SS_DET_DEV_PRESENT 0x00000001
+#define ATA_SS_DET_PHY_ONLINE 0x00000003
+#define ATA_SS_DET_PHY_OFFLINE 0x00000004
+
+#define ATA_SS_SPD_MASK 0x000000f0
+#define ATA_SS_SPD_NO_SPEED 0x00000000
+#define ATA_SS_SPD_GEN1 0x00000010
+#define ATA_SS_SPD_GEN2 0x00000020
+#define ATA_SS_SPD_GEN3 0x00000030
+
+#define ATA_SS_IPM_MASK 0x00000f00
+#define ATA_SS_IPM_NO_DEVICE 0x00000000
+#define ATA_SS_IPM_ACTIVE 0x00000100
+#define ATA_SS_IPM_PARTIAL 0x00000200
+#define ATA_SS_IPM_SLUMBER 0x00000600
+#define ATA_SS_IPM_DEVSLEEP 0x00000800
+
+#define ATA_SERROR 14
+#define ATA_SE_DATA_CORRECTED 0x00000001
+#define ATA_SE_COMM_CORRECTED 0x00000002
+#define ATA_SE_DATA_ERR 0x00000100
+#define ATA_SE_COMM_ERR 0x00000200
+#define ATA_SE_PROT_ERR 0x00000400
+#define ATA_SE_HOST_ERR 0x00000800
+#define ATA_SE_PHY_CHANGED 0x00010000
+#define ATA_SE_PHY_IERROR 0x00020000
+#define ATA_SE_COMM_WAKE 0x00040000
+#define ATA_SE_DECODE_ERR 0x00080000
+#define ATA_SE_PARITY_ERR 0x00100000
+#define ATA_SE_CRC_ERR 0x00200000
+#define ATA_SE_HANDSHAKE_ERR 0x00400000
+#define ATA_SE_LINKSEQ_ERR 0x00800000
+#define ATA_SE_TRANSPORT_ERR 0x01000000
+#define ATA_SE_UNKNOWN_FIS 0x02000000
+#define ATA_SE_EXCHANGED 0x04000000
+
+#define ATA_SCONTROL 15
+#define ATA_SC_DET_MASK 0x0000000f
+#define ATA_SC_DET_IDLE 0x00000000
+#define ATA_SC_DET_RESET 0x00000001
+#define ATA_SC_DET_DISABLE 0x00000004
+
+#define ATA_SC_SPD_MASK 0x000000f0
+#define ATA_SC_SPD_NO_SPEED 0x00000000
+#define ATA_SC_SPD_SPEED_GEN1 0x00000010
+#define ATA_SC_SPD_SPEED_GEN2 0x00000020
+#define ATA_SC_SPD_SPEED_GEN3 0x00000030
+
+#define ATA_SC_IPM_MASK 0x00000f00
+#define ATA_SC_IPM_NONE 0x00000000
+#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
+#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
+#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400
+
+#define ATA_SACTIVE 16
+
+#define AHCI_MAX_PORTS 32
+#define AHCI_MAX_SLOTS 32
+#define AHCI_MAX_IRQS 16
+
+/* SATA AHCI v1.0 register defines */
+#define AHCI_CAP 0x00
+#define AHCI_CAP_NPMASK 0x0000001f
+#define AHCI_CAP_SXS 0x00000020
+#define AHCI_CAP_EMS 0x00000040
+#define AHCI_CAP_CCCS 0x00000080
+#define AHCI_CAP_NCS 0x00001F00
+#define AHCI_CAP_NCS_SHIFT 8
+#define AHCI_CAP_PSC 0x00002000
+#define AHCI_CAP_SSC 0x00004000
+#define AHCI_CAP_PMD 0x00008000
+#define AHCI_CAP_FBSS 0x00010000
+#define AHCI_CAP_SPM 0x00020000
+#define AHCI_CAP_SAM 0x00080000
+#define AHCI_CAP_ISS 0x00F00000
+#define AHCI_CAP_ISS_SHIFT 20
+#define AHCI_CAP_SCLO 0x01000000
+#define AHCI_CAP_SAL 0x02000000
+#define AHCI_CAP_SALP 0x04000000
+#define AHCI_CAP_SSS 0x08000000
+#define AHCI_CAP_SMPS 0x10000000
+#define AHCI_CAP_SSNTF 0x20000000
+#define AHCI_CAP_SNCQ 0x40000000
+#define AHCI_CAP_64BIT 0x80000000
+
+#define AHCI_GHC 0x04
+#define AHCI_GHC_AE 0x80000000
+#define AHCI_GHC_MRSM 0x00000004
+#define AHCI_GHC_IE 0x00000002
+#define AHCI_GHC_HR 0x00000001
+
+#define AHCI_IS 0x08
+#define AHCI_PI 0x0c
+#define AHCI_VS 0x10
+
+#define AHCI_CCCC 0x14
+#define AHCI_CCCC_TV_MASK 0xffff0000
+#define AHCI_CCCC_TV_SHIFT 16
+#define AHCI_CCCC_CC_MASK 0x0000ff00
+#define AHCI_CCCC_CC_SHIFT 8
+#define AHCI_CCCC_INT_MASK 0x000000f8
+#define AHCI_CCCC_INT_SHIFT 3
+#define AHCI_CCCC_EN 0x00000001
+#define AHCI_CCCP 0x18
+
+#define AHCI_EM_LOC 0x1C
+#define AHCI_EM_CTL 0x20
+#define AHCI_EM_MR 0x00000001
+#define AHCI_EM_TM 0x00000100
+#define AHCI_EM_RST 0x00000200
+#define AHCI_EM_LED 0x00010000
+#define AHCI_EM_SAFTE 0x00020000
+#define AHCI_EM_SES2 0x00040000
+#define AHCI_EM_SGPIO 0x00080000
+#define AHCI_EM_SMB 0x01000000
+#define AHCI_EM_XMT 0x02000000
+#define AHCI_EM_ALHD 0x04000000
+#define AHCI_EM_PM 0x08000000
+
+#define AHCI_CAP2 0x24
+#define AHCI_CAP2_BOH 0x00000001
+#define AHCI_CAP2_NVMP 0x00000002
+#define AHCI_CAP2_APST 0x00000004
+#define AHCI_CAP2_SDS 0x00000008
+#define AHCI_CAP2_SADM 0x00000010
+#define AHCI_CAP2_DESO 0x00000020
+
+#define AHCI_OFFSET 0x100
+#define AHCI_STEP 0x80
+
+#define AHCI_P_CLB 0x00
+#define AHCI_P_CLBU 0x04
+#define AHCI_P_FB 0x08
+#define AHCI_P_FBU 0x0c
+#define AHCI_P_IS 0x10
+#define AHCI_P_IE 0x14
+#define AHCI_P_IX_DHR 0x00000001
+#define AHCI_P_IX_PS 0x00000002
+#define AHCI_P_IX_DS 0x00000004
+#define AHCI_P_IX_SDB 0x00000008
+#define AHCI_P_IX_UF 0x00000010
+#define AHCI_P_IX_DP 0x00000020
+#define AHCI_P_IX_PC 0x00000040
+#define AHCI_P_IX_MP 0x00000080
+
+#define AHCI_P_IX_PRC 0x00400000
+#define AHCI_P_IX_IPM 0x00800000
+#define AHCI_P_IX_OF 0x01000000
+#define AHCI_P_IX_INF 0x04000000
+#define AHCI_P_IX_IF 0x08000000
+#define AHCI_P_IX_HBD 0x10000000
+#define AHCI_P_IX_HBF 0x20000000
+#define AHCI_P_IX_TFE 0x40000000
+#define AHCI_P_IX_CPD 0x80000000
+
+#define AHCI_P_CMD 0x18
+#define AHCI_P_CMD_ST 0x00000001
+#define AHCI_P_CMD_SUD 0x00000002
+#define AHCI_P_CMD_POD 0x00000004
+#define AHCI_P_CMD_CLO 0x00000008
+#define AHCI_P_CMD_FRE 0x00000010
+#define AHCI_P_CMD_CCS_MASK 0x00001f00
+#define AHCI_P_CMD_CCS_SHIFT 8
+#define AHCI_P_CMD_ISS 0x00002000
+#define AHCI_P_CMD_FR 0x00004000
+#define AHCI_P_CMD_CR 0x00008000
+#define AHCI_P_CMD_CPS 0x00010000
+#define AHCI_P_CMD_PMA 0x00020000
+#define AHCI_P_CMD_HPCP 0x00040000
+#define AHCI_P_CMD_MPSP 0x00080000
+#define AHCI_P_CMD_CPD 0x00100000
+#define AHCI_P_CMD_ESP 0x00200000
+#define AHCI_P_CMD_FBSCP 0x00400000
+#define AHCI_P_CMD_APSTE 0x00800000
+#define AHCI_P_CMD_ATAPI 0x01000000
+#define AHCI_P_CMD_DLAE 0x02000000
+#define AHCI_P_CMD_ALPE 0x04000000
+#define AHCI_P_CMD_ASP 0x08000000
+#define AHCI_P_CMD_ICC_MASK 0xf0000000
+#define AHCI_P_CMD_NOOP 0x00000000
+#define AHCI_P_CMD_ACTIVE 0x10000000
+#define AHCI_P_CMD_PARTIAL 0x20000000
+#define AHCI_P_CMD_SLUMBER 0x60000000
+#define AHCI_P_CMD_DEVSLEEP 0x80000000
+
+#define AHCI_P_TFD 0x20
+#define AHCI_P_SIG 0x24
+#define AHCI_P_SSTS 0x28
+#define AHCI_P_SCTL 0x2c
+#define AHCI_P_SERR 0x30
+#define AHCI_P_SACT 0x34
+#define AHCI_P_CI 0x38
+#define AHCI_P_SNTF 0x3C
+#define AHCI_P_FBS 0x40
+#define AHCI_P_FBS_EN 0x00000001
+#define AHCI_P_FBS_DEC 0x00000002
+#define AHCI_P_FBS_SDE 0x00000004
+#define AHCI_P_FBS_DEV 0x00000f00
+#define AHCI_P_FBS_DEV_SHIFT 8
+#define AHCI_P_FBS_ADO 0x0000f000
+#define AHCI_P_FBS_ADO_SHIFT 12
+#define AHCI_P_FBS_DWE 0x000f0000
+#define AHCI_P_FBS_DWE_SHIFT 16
+#define AHCI_P_DEVSLP 0x44
+#define AHCI_P_DEVSLP_ADSE 0x00000001
+#define AHCI_P_DEVSLP_DSP 0x00000002
+#define AHCI_P_DEVSLP_DETO 0x000003fc
+#define AHCI_P_DEVSLP_DETO_SHIFT 2
+#define AHCI_P_DEVSLP_MDAT 0x00007c00
+#define AHCI_P_DEVSLP_MDAT_SHIFT 10
+#define AHCI_P_DEVSLP_DITO 0x01ff8000
+#define AHCI_P_DEVSLP_DITO_SHIFT 15
+#define AHCI_P_DEVSLP_DM 0x0e000000
+#define AHCI_P_DEVSLP_DM_SHIFT 25
+
+/* Just to be sure, if building as module. */
+#if MAXPHYS < 512 * 1024
+#undef MAXPHYS
+/* Parenthesized so the value survives use inside larger expressions. */
+#define MAXPHYS (512 * 1024)
+#endif
+/* Pessimistic prognosis on number of required S/G entries */
+#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
+/* Command list. 32 commands. First, 1Kbyte aligned. */
+#define AHCI_CL_OFFSET 0
+#define AHCI_CL_SIZE 32
+/* Command tables. Up to 32 commands, Each, 128byte aligned. */
+#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS)
+#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16)
+/* Total main work area. */
+#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots)
+
+#endif /* _AHCI_H_ */
diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c
new file mode 100644
index 0000000000..1c1838c2e8
--- /dev/null
+++ b/usr/src/cmd/bhyve/atkbdc.c
@@ -0,0 +1,586 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "acpi.h"
+#include "atkbdc.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "ps2kbd.h"
+#include "ps2mouse.h"
+
+#define KBD_DATA_PORT 0x60
+
+#define KBD_STS_CTL_PORT 0x64
+
+#define KBDC_RESET 0xfe
+
+#define KBD_DEV_IRQ 1
+#define AUX_DEV_IRQ 12
+
+/* controller commands */
+#define KBDC_SET_COMMAND_BYTE 0x60
+#define KBDC_GET_COMMAND_BYTE 0x20
+#define KBDC_DISABLE_AUX_PORT 0xa7
+#define KBDC_ENABLE_AUX_PORT 0xa8
+#define KBDC_TEST_AUX_PORT 0xa9
+#define KBDC_TEST_CTRL 0xaa
+#define KBDC_TEST_KBD_PORT 0xab
+#define KBDC_DISABLE_KBD_PORT 0xad
+#define KBDC_ENABLE_KBD_PORT 0xae
+#define KBDC_READ_INPORT 0xc0
+#define KBDC_READ_OUTPORT 0xd0
+#define KBDC_WRITE_OUTPORT 0xd1
+#define KBDC_WRITE_KBD_OUTBUF 0xd2
+#define KBDC_WRITE_AUX_OUTBUF 0xd3
+#define KBDC_WRITE_TO_AUX 0xd4
+
+/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */
+#define KBD_TRANSLATION 0x40
+#define KBD_SYS_FLAG_BIT 0x04
+#define KBD_DISABLE_KBD_PORT 0x10
+#define KBD_DISABLE_AUX_PORT 0x20
+#define KBD_ENABLE_AUX_INT 0x02
+#define KBD_ENABLE_KBD_INT 0x01
+#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT)
+#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT)
+
+/* controller status bits */
+#define KBDS_KBD_BUFFER_FULL 0x01
+#define KBDS_SYS_FLAG 0x04
+#define KBDS_CTRL_FLAG 0x08
+#define KBDS_AUX_BUFFER_FULL 0x20
+
+/* controller output port */
+#define KBDO_KBD_OUTFULL 0x10
+#define KBDO_AUX_OUTFULL 0x20
+
+#define RAMSZ 32
+#define FIFOSZ 15
+#define CTRL_CMD_FLAG 0x8000
+
+/*
+ * Keyboard side of the controller: interrupt bookkeeping plus a ring
+ * buffer of bytes waiting to be read from the data port.
+ */
+struct kbd_dev {
+ bool irq_active; /* set when the keyboard interrupt is pulsed */
+ int irq; /* ISA IRQ number used for the keyboard */
+
+ uint8_t buffer[FIFOSZ]; /* ring buffer storage */
+ int brd, bwr; /* read and write indices into buffer */
+ int bcnt; /* number of bytes currently queued */
+};
+
+/* Auxiliary (mouse) side of the controller: interrupt bookkeeping only. */
+struct aux_dev {
+ bool irq_active; /* set when the aux interrupt is pulsed */
+ int irq; /* ISA IRQ number used for the aux device */
+};
+
+/*
+ * Per-controller state.  The I/O port handlers and atkbdc_event() take
+ * mtx before touching the fields below.
+ */
+struct atkbdc_softc {
+ struct vmctx *ctx;
+ pthread_mutex_t mtx;
+
+ struct ps2kbd_softc *ps2kbd_sc;
+ struct ps2mouse_softc *ps2mouse_sc;
+
+ uint8_t status; /* status register */
+ uint8_t outport; /* controller output port */
+ uint8_t ram[RAMSZ]; /* byte0 = controller config */
+
+ uint32_t curcmd; /* current command for next byte */
+ uint32_t ctrlbyte; /* pending controller reply: CTRL_CMD_FLAG | value */
+
+ struct kbd_dev kbd;
+ struct aux_dev aux;
+};
+
+/*
+ * Pulse the keyboard IRQ, provided the keyboard interrupt is enabled in
+ * the controller command byte (ram[0]).
+ */
+static void
+atkbdc_assert_kbd_intr(struct atkbdc_softc *sc)
+{
+	/* Nothing to do while the keyboard interrupt is masked. */
+	if ((sc->ram[0] & KBD_ENABLE_KBD_INT) == 0)
+		return;
+
+	sc->kbd.irq_active = true;
+	vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq);
+}
+
+/*
+ * Pulse the auxiliary-device IRQ, provided the aux interrupt is enabled
+ * in the controller command byte (ram[0]).
+ */
+static void
+atkbdc_assert_aux_intr(struct atkbdc_softc *sc)
+{
+	/* Nothing to do while the aux interrupt is masked. */
+	if ((sc->ram[0] & KBD_ENABLE_AUX_INT) == 0)
+		return;
+
+	sc->aux.irq_active = true;
+	vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq);
+}
+
+/*
+ * Append one byte to the keyboard FIFO and flag the output buffer as
+ * full in both the status register and the output port.  When the FIFO
+ * has no room the byte is dropped and a message is printed.
+ *
+ * The caller must hold sc->mtx.  Returns non-zero while the FIFO still
+ * has free space after the call, zero once it is full.
+ */
+static int
+atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val)
+{
+	struct kbd_dev *kbd = &sc->kbd;
+
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	if (kbd->bcnt >= FIFOSZ) {
+		printf("atkbd data buffer full\n");
+	} else {
+		kbd->buffer[kbd->bwr] = val;
+		kbd->bwr = (kbd->bwr + 1) % FIFOSZ;
+		kbd->bcnt++;
+		sc->status |= KBDS_KBD_BUFFER_FULL;
+		sc->outport |= KBDO_KBD_OUTFULL;
+	}
+
+	return (kbd->bcnt < FIFOSZ);
+}
+
+/*
+ * Pull pending bytes from the PS/2 keyboard into the controller FIFO.
+ *
+ * With KBD_TRANSLATION set in the command byte, bytes are mapped
+ * through the translation table below; a 0xf0 prefix is consumed and
+ * turned into bit 0x80 on the following translated byte, and at most
+ * one translated byte is queued per call.  Without translation, bytes
+ * are queued unchanged until the FIFO is full or the keyboard has no
+ * more data.
+ *
+ * The keyboard interrupt is asserted when the FIFO is non-empty and the
+ * aux port is disabled or the mouse FIFO is empty.
+ *
+ * The caller must hold sc->mtx.
+ */
+static void
+atkbdc_kbd_read(struct atkbdc_softc *sc)
+{
+	/* Read-only lookup table; kept static so it is not rebuilt per call. */
+	static const uint8_t translation[256] = {
+		0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58,
+		0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59,
+		0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a,
+		0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b,
+		0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c,
+		0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d,
+		0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e,
+		0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f,
+		0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60,
+		0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61,
+		0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e,
+		0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76,
+		0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b,
+		0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f,
+		0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45,
+		0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54,
+		0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87,
+		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+		0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+		0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+		0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+		0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+		0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+		0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+		0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+		0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+		0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+		0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+		0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+		0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+		0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+	};
+	uint8_t val;
+	uint8_t release = 0;
+
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	if (sc->ram[0] & KBD_TRANSLATION) {
+		while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) {
+			if (val == 0xf0) {
+				/* Prefix byte: mark the next byte with 0x80. */
+				release = 0x80;
+				continue;
+			} else {
+				val = translation[val] | release;
+			}
+			atkbdc_kbd_queue_data(sc, val);
+			break;
+		}
+	} else {
+		while (sc->kbd.bcnt < FIFOSZ) {
+			if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1)
+				atkbdc_kbd_queue_data(sc, val);
+			else
+				break;
+		}
+	}
+
+	if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) ||
+	    ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0)
+		atkbdc_assert_kbd_intr(sc);
+}
+
+/*
+ * If the mouse FIFO holds data, flag it in the status register and the
+ * output port and assert the aux interrupt.
+ */
+static void
+atkbdc_aux_poll(struct atkbdc_softc *sc)
+{
+	if (ps2mouse_fifocnt(sc->ps2mouse_sc) <= 0)
+		return;
+
+	sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
+	sc->outport |= KBDO_AUX_OUTFULL;
+	atkbdc_assert_aux_intr(sc);
+}
+
+/*
+ * Poll the keyboard device by delegating to atkbdc_kbd_read().
+ * The caller must hold sc->mtx.
+ */
+static void
+atkbdc_kbd_poll(struct atkbdc_softc *sc)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ atkbdc_kbd_read(sc);
+}
+
+/* Poll both devices: the aux (mouse) side first, then the keyboard. */
+static void
+atkbdc_poll(struct atkbdc_softc *sc)
+{
+ atkbdc_aux_poll(sc);
+ atkbdc_kbd_poll(sc);
+}
+
+/*
+ * Fetch the next byte for a data-port read into *buf.
+ *
+ * Mouse data takes priority: if ps2mouse_read() returns 0 the function
+ * updates the buffer-full flags, re-polls both devices and returns.
+ * Otherwise the oldest byte of the keyboard FIFO, if any, is returned.
+ *
+ * NOTE(review): when ps2mouse_read() does not return 0 and the keyboard
+ * FIFO is empty, this function itself never stores to *buf - confirm
+ * whether ps2mouse_read() writes it on failure, or have callers
+ * pre-initialize the byte.
+ *
+ * The caller must hold sc->mtx.
+ */
+static void
+atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) {
+ if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) {
+ /* Mouse FIFO drained: clear the flags it was holding. */
+ if (sc->kbd.bcnt == 0)
+ sc->status &= ~(KBDS_AUX_BUFFER_FULL |
+ KBDS_KBD_BUFFER_FULL);
+ else
+ sc->status &= ~(KBDS_AUX_BUFFER_FULL);
+ sc->outport &= ~KBDO_AUX_OUTFULL;
+ }
+
+ atkbdc_poll(sc);
+ return;
+ }
+
+ if (sc->kbd.bcnt > 0) {
+ /* Pop the oldest byte from the keyboard ring buffer. */
+ *buf = sc->kbd.buffer[sc->kbd.brd];
+ sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ;
+ sc->kbd.bcnt--;
+ if (sc->kbd.bcnt == 0) {
+ sc->status &= ~KBDS_KBD_BUFFER_FULL;
+ sc->outport &= ~KBDO_KBD_OUTFULL;
+ }
+
+ atkbdc_poll(sc);
+ }
+
+ /* Both FIFOs empty: make sure no buffer-full flag stays set. */
+ if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) {
+ sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+ }
+}
+
+/*
+ * I/O handler for the data port (KBD_DATA_PORT).
+ *
+ * Reads return the pending controller reply (ctrlbyte) if there is one,
+ * otherwise the next byte from the device FIFOs.  Writes are treated as
+ * the argument of the pending controller command when KBDS_CTRL_FLAG is
+ * set in the status register, and otherwise forwarded to the keyboard.
+ *
+ * Only single-byte accesses are accepted; returns -1 for any other
+ * size and 0 otherwise.  arg is the struct atkbdc_softc registered with
+ * the port.
+ */
+static int
+atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	struct atkbdc_softc *sc;
+	uint8_t buf;
+	int retval;
+
+	if (bytes != 1)
+		return (-1);
+	sc = arg;
+	retval = 0;
+
+	pthread_mutex_lock(&sc->mtx);
+	if (in) {
+		sc->curcmd = 0;
+		if (sc->ctrlbyte != 0) {
+			*eax = sc->ctrlbyte & 0xff;
+			sc->ctrlbyte = 0;
+		} else {
+			/*
+			 * read device buffer; includes kbd cmd responses.
+			 * Pre-set the byte: the dequeue may leave it
+			 * unwritten when no data is pending, and the guest
+			 * must not see an uninitialized value.
+			 */
+			buf = 0;
+			atkbdc_dequeue_data(sc, &buf);
+			*eax = buf;
+		}
+
+		sc->status &= ~KBDS_CTRL_FLAG;
+		pthread_mutex_unlock(&sc->mtx);
+		return (retval);
+	}
+
+	if (sc->status & KBDS_CTRL_FLAG) {
+		/*
+		 * Command byte for the controller.
+		 */
+		switch (sc->curcmd) {
+		case KBDC_SET_COMMAND_BYTE:
+			sc->ram[0] = *eax;
+			if (sc->ram[0] & KBD_SYS_FLAG_BIT)
+				sc->status |= KBDS_SYS_FLAG;
+			else
+				sc->status &= ~KBDS_SYS_FLAG;
+			break;
+		case KBDC_WRITE_OUTPORT:
+			sc->outport = *eax;
+			break;
+		case KBDC_WRITE_TO_AUX:
+			ps2mouse_write(sc->ps2mouse_sc, *eax, 0);
+			atkbdc_poll(sc);
+			break;
+		case KBDC_WRITE_KBD_OUTBUF:
+			atkbdc_kbd_queue_data(sc, *eax);
+			break;
+		case KBDC_WRITE_AUX_OUTBUF:
+			ps2mouse_write(sc->ps2mouse_sc, *eax, 1);
+			sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+			atkbdc_aux_poll(sc);
+			break;
+		default:
+			/* write to particular RAM byte */
+			if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) {
+				int byten;
+
+				byten = (sc->curcmd - 0x60) & 0x1f;
+				sc->ram[byten] = *eax & 0xff;
+			}
+			break;
+		}
+
+		sc->curcmd = 0;
+		sc->status &= ~KBDS_CTRL_FLAG;
+
+		pthread_mutex_unlock(&sc->mtx);
+		return (retval);
+	}
+
+	/*
+	 * Data byte for the device.
+	 */
+	ps2kbd_write(sc->ps2kbd_sc, *eax);
+	atkbdc_poll(sc);
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	return (retval);
+}
+
+/*
+ * I/O handler for the status/command port (KBD_STS_CTL_PORT).
+ *
+ * Reads return the status register.  Writes are controller commands:
+ * commands with an immediate reply store it in ctrlbyte (tagged with
+ * CTRL_CMD_FLAG so a zero reply is still recognized as pending);
+ * commands that take an argument are remembered in curcmd for the next
+ * data-port write; KBDC_RESET requests a VM reset.
+ *
+ * After the command has been processed, the buffer-full flags are
+ * updated and an interrupt is raised for whichever source has data, in
+ * the order: controller reply, mouse, keyboard.  All of this is done
+ * with sc->mtx held, as for every other access to the controller state.
+ *
+ * Only single-byte accesses are accepted; returns -1 for any other
+ * size and 0 otherwise.  arg is the struct atkbdc_softc registered with
+ * the port.
+ */
+static int
+atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
+    int bytes, uint32_t *eax, void *arg)
+{
+	struct atkbdc_softc *sc;
+	int error, retval;
+
+	if (bytes != 1)
+		return (-1);
+
+	sc = arg;
+	retval = 0;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	if (in) {
+		/* read status register */
+		*eax = sc->status;
+		pthread_mutex_unlock(&sc->mtx);
+		return (retval);
+	}
+
+	sc->curcmd = 0;
+	sc->status |= KBDS_CTRL_FLAG;
+	sc->ctrlbyte = 0;
+
+	switch (*eax) {
+	case KBDC_GET_COMMAND_BYTE:
+		sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0];
+		break;
+	case KBDC_TEST_CTRL:
+		sc->ctrlbyte = CTRL_CMD_FLAG | 0x55;
+		break;
+	case KBDC_TEST_AUX_PORT:
+	case KBDC_TEST_KBD_PORT:
+		sc->ctrlbyte = CTRL_CMD_FLAG | 0;
+		break;
+	case KBDC_READ_INPORT:
+		sc->ctrlbyte = CTRL_CMD_FLAG | 0;
+		break;
+	case KBDC_READ_OUTPORT:
+		sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport;
+		break;
+	case KBDC_SET_COMMAND_BYTE:
+	case KBDC_WRITE_OUTPORT:
+	case KBDC_WRITE_KBD_OUTBUF:
+	case KBDC_WRITE_AUX_OUTBUF:
+		sc->curcmd = *eax;
+		break;
+	case KBDC_DISABLE_KBD_PORT:
+		sc->ram[0] |= KBD_DISABLE_KBD_PORT;
+		break;
+	case KBDC_ENABLE_KBD_PORT:
+		sc->ram[0] &= ~KBD_DISABLE_KBD_PORT;
+		if (sc->kbd.bcnt > 0)
+			sc->status |= KBDS_KBD_BUFFER_FULL;
+		atkbdc_poll(sc);
+		break;
+	case KBDC_WRITE_TO_AUX:
+		sc->curcmd = *eax;
+		break;
+	case KBDC_DISABLE_AUX_PORT:
+		sc->ram[0] |= KBD_DISABLE_AUX_PORT;
+		ps2mouse_toggle(sc->ps2mouse_sc, 0);
+		sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+		/* outport carries KBDO_* bits, not KBDS_* status bits. */
+		sc->outport &= ~KBDO_AUX_OUTFULL;
+		break;
+	case KBDC_ENABLE_AUX_PORT:
+		sc->ram[0] &= ~KBD_DISABLE_AUX_PORT;
+		ps2mouse_toggle(sc->ps2mouse_sc, 1);
+		if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0)
+			sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
+		break;
+	case KBDC_RESET:		/* Pulse "reset" line */
+		error = vm_suspend(ctx, VM_SUSPEND_RESET);
+		assert(error == 0 || errno == EALREADY);
+		break;
+	default:
+		if (*eax >= 0x21 && *eax <= 0x3f) {
+			/* read "byte N" from RAM */
+			int byten;
+
+			byten = (*eax - 0x20) & 0x1f;
+			sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten];
+		}
+		break;
+	}
+
+	/*
+	 * Still under the lock: these updates touch the same status and
+	 * FIFO state that the data-port handler and atkbdc_event() modify.
+	 */
+	if (sc->ctrlbyte != 0) {
+		sc->status |= KBDS_KBD_BUFFER_FULL;
+		sc->status &= ~KBDS_AUX_BUFFER_FULL;
+		atkbdc_assert_kbd_intr(sc);
+	} else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 &&
+	    (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) {
+		sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
+		atkbdc_assert_aux_intr(sc);
+	} else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) {
+		sc->status |= KBDS_KBD_BUFFER_FULL;
+		atkbdc_assert_kbd_intr(sc);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	return (retval);
+}
+
+/*
+ * Notify the controller that a device has new data: poll the keyboard
+ * when iskbd is non-zero, otherwise the aux device.  The poll runs with
+ * sc->mtx held.
+ */
+void
+atkbdc_event(struct atkbdc_softc *sc, int iskbd)
+{
+	void (*poll)(struct atkbdc_softc *);
+
+	poll = iskbd ? atkbdc_kbd_poll : atkbdc_aux_poll;
+
+	pthread_mutex_lock(&sc->mtx);
+	poll(sc);
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Create the AT keyboard controller for a VM: allocate its state,
+ * register handlers for the status/command and data ports, reserve the
+ * keyboard and aux IRQs, and initialize the attached PS/2 keyboard and
+ * mouse.  Allocation and registration failures are treated as fatal
+ * via assert().
+ */
+void
+atkbdc_init(struct vmctx *ctx)
+{
+	struct inout_port iop;
+	struct atkbdc_softc *sc;
+	int error;
+
+	sc = calloc(1, sizeof(struct atkbdc_softc));
+	/* Fail loudly here rather than dereferencing NULL below. */
+	assert(sc != NULL);
+	sc->ctx = ctx;
+
+	pthread_mutex_init(&sc->mtx, NULL);
+
+	bzero(&iop, sizeof(struct inout_port));
+	iop.name = "atkdbc";
+	iop.port = KBD_STS_CTL_PORT;
+	iop.size = 1;
+	iop.flags = IOPORT_F_INOUT;
+	iop.handler = atkbdc_sts_ctl_handler;
+	iop.arg = sc;
+
+	error = register_inout(&iop);
+	assert(error == 0);
+
+	bzero(&iop, sizeof(struct inout_port));
+	iop.name = "atkdbc";
+	iop.port = KBD_DATA_PORT;
+	iop.size = 1;
+	iop.flags = IOPORT_F_INOUT;
+	iop.handler = atkbdc_data_handler;
+	iop.arg = sc;
+
+	error = register_inout(&iop);
+	assert(error == 0);
+
+	pci_irq_reserve(KBD_DEV_IRQ);
+	sc->kbd.irq = KBD_DEV_IRQ;
+
+	pci_irq_reserve(AUX_DEV_IRQ);
+	sc->aux.irq = AUX_DEV_IRQ;
+
+	sc->ps2kbd_sc = ps2kbd_init(sc);
+	sc->ps2mouse_sc = ps2mouse_init(sc);
+}
+
+/*
+ * Emit the DSDT entries for the keyboard (KBD) and mouse (MOU) devices.
+ * Both list the controller's data and status/command ports; the IRQs
+ * come from the same constants that atkbdc_init() reserves, so the
+ * table cannot drift from the emulation.
+ */
+static void
+atkbdc_dsdt(void)
+{
+
+	dsdt_line("");
+	dsdt_line("Device (KBD)");
+	dsdt_line("{");
+	dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))");
+	dsdt_line(" Name (_CRS, ResourceTemplate ()");
+	dsdt_line(" {");
+	dsdt_indent(2);
+	dsdt_fixed_ioport(KBD_DATA_PORT, 1);
+	dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
+	dsdt_fixed_irq(KBD_DEV_IRQ);
+	dsdt_unindent(2);
+	dsdt_line(" })");
+	dsdt_line("}");
+
+	dsdt_line("");
+	dsdt_line("Device (MOU)");
+	dsdt_line("{");
+	dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))");
+	dsdt_line(" Name (_CRS, ResourceTemplate ()");
+	dsdt_line(" {");
+	dsdt_indent(2);
+	dsdt_fixed_ioport(KBD_DATA_PORT, 1);
+	dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
+	dsdt_fixed_irq(AUX_DEV_IRQ);
+	dsdt_unindent(2);
+	dsdt_line(" })");
+	dsdt_line("}");
+}
+LPC_DSDT(atkbdc_dsdt);
+
diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h
new file mode 100644
index 0000000000..85c8a7141e
--- /dev/null
+++ b/usr/src/cmd/bhyve/atkbdc.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ATKBDC_H_
+#define _ATKBDC_H_
+/* Forward declarations: this header only passes pointers to these types. */
+struct atkbdc_softc;
+struct vmctx;
+/*
+ * atkbdc_init() creates the keyboard controller emulation for a guest.
+ * atkbdc_event() takes the controller lock and polls the keyboard when
+ * 'iskbd' is non-zero, otherwise the aux device.
+ */
+void atkbdc_init(struct vmctx *ctx);
+void atkbdc_event(struct atkbdc_softc *sc, int iskbd);
+
+#endif /* _ATKBDC_H_ */
diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c
new file mode 100644
index 0000000000..7b24ea7f5d
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/uio.h>
+
+#include <termios.h>
+#include <unistd.h>
+
+/*
+ * Make a pre-existing termios structure into "raw" mode: character-at-a-time
+ * mode with no characters interpreted, 8-bit data path.
+ */
+void
+cfmakeraw(struct termios *t)
+{
+	/* Input processing that must be off: translation, stripping, flow control. */
+	const tcflag_t in_off = IMAXBEL | IXOFF | INPCK | BRKINT | PARMRK |
+	    ISTRIP | INLCR | IGNCR | ICRNL | IXON | IGNPAR;
+	/* Local processing that must be off: echo, line editing, signals. */
+	const tcflag_t local_off = ECHO | ECHOE | ECHOK | ECHONL | ICANON |
+	    ISIG | IEXTEN | NOFLSH | TOSTOP | PENDIN;
+
+	t->c_iflag = (t->c_iflag & ~in_off) | IGNBRK;
+	t->c_oflag &= ~OPOST;
+	t->c_lflag &= ~local_off;
+	/* Character size forced to CS8, parity generation off, receiver on. */
+	t->c_cflag = (t->c_cflag & ~(CSIZE | PARENB)) | CS8 | CREAD;
+	/* Character-at-a-time reads: one byte minimum, no inter-byte timer. */
+	t->c_cc[VMIN] = 1;
+	t->c_cc[VTIME] = 0;
+}
diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c
new file mode 100644
index 0000000000..4bd49ded79
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyvegc.c
@@ -0,0 +1,103 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "bhyvegc.h"
+
+struct bhyvegc {
+ struct bhyvegc_image *gc_image; /* dimensions and pixel buffer; allocated in bhyvegc_init() */
+ int raw; /* 1 when gc_image->data is a caller-supplied framebuffer, 0 when it was allocated here */
+};
+
+/*
+ * Allocate a graphics console with the given pixel dimensions.
+ *
+ * If 'fbaddr' is non-NULL it is used directly as the pixel buffer and the
+ * console is marked raw; otherwise a zero-filled buffer of width * height
+ * 32-bit pixels is allocated here.
+ *
+ * Returns the new console, or NULL when its bookkeeping structures cannot
+ * be allocated.  When only the pixel buffer cannot be allocated (or a
+ * dimension is negative) the console is still returned, with a NULL image
+ * data pointer.
+ */
+struct bhyvegc *
+bhyvegc_init(int width, int height, void *fbaddr)
+{
+	struct bhyvegc *gc;
+	struct bhyvegc_image *gc_image;
+
+	gc = calloc(1, sizeof (struct bhyvegc));
+	gc_image = calloc(1, sizeof(struct bhyvegc_image));
+	if (gc == NULL || gc_image == NULL) {
+		/* free(NULL) is a no-op, so both can be released blindly. */
+		free(gc_image);
+		free(gc);
+		return (NULL);
+	}
+
+	gc_image->width = width;
+	gc_image->height = height;
+	if (fbaddr) {
+		gc_image->data = fbaddr;
+		gc->raw = 1;
+	} else {
+		/*
+		 * Form the pixel count in size_t rather than int so that
+		 * large dimensions do not overflow the multiplication.
+		 */
+		if (width >= 0 && height >= 0)
+			gc_image->data = calloc((size_t)width * (size_t)height,
+			    sizeof (uint32_t));
+		gc->raw = 0;
+	}
+
+	gc->gc_image = gc_image;
+
+	return (gc);
+}
+
+/*
+ * Switch the console to a caller-supplied framebuffer.
+ *
+ * A pixel buffer that was allocated by this module (raw == 0) is released;
+ * a previously supplied framebuffer is owned by the caller and is never
+ * passed to free().  After the call the console is in raw mode and its
+ * image data pointer is 'fbaddr'.
+ */
+void
+bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr)
+{
+	struct bhyvegc_image *gc_image;
+
+	gc_image = gc->gc_image;
+
+	/* Only memory obtained from calloc() here may go back to free(). */
+	if (!gc->raw && gc_image->data != fbaddr)
+		free(gc_image->data);
+
+	gc->raw = 1;
+	gc_image->data = fbaddr;
+}
+
+/*
+ * Record new dimensions for the console image.
+ *
+ * In raw mode only the recorded width and height change.  Otherwise the
+ * pixel buffer is replaced by a zero-filled buffer of the new size; the
+ * buffer is always cleared on resize, so nothing is carried over from the
+ * old one.  If the new buffer cannot be allocated (or a dimension is not
+ * positive) the image data pointer becomes NULL and the old buffer is
+ * still released.
+ */
+void
+bhyvegc_resize(struct bhyvegc *gc, int width, int height)
+{
+	struct bhyvegc_image *gc_image;
+	uint32_t *newdata, *olddata;
+
+	gc_image = gc->gc_image;
+
+	gc_image->width = width;
+	gc_image->height = height;
+	if (gc->raw)
+		return;
+
+	/*
+	 * Allocate the replacement first and form the pixel count in size_t;
+	 * calloc() both zero-fills and rejects an oversized request.
+	 */
+	newdata = NULL;
+	if (width > 0 && height > 0)
+		newdata = calloc((size_t)width * (size_t)height,
+		    sizeof (uint32_t));
+
+	olddata = gc_image->data;
+	gc_image->data = newdata;
+	free(olddata);
+}
+
+/*
+ * Return the image descriptor of 'gc', or NULL when 'gc' itself is NULL.
+ */
+struct bhyvegc_image *
+bhyvegc_get_image(struct bhyvegc *gc)
+{
+	return (gc != NULL ? gc->gc_image : NULL);
+}
diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h
new file mode 100644
index 0000000000..11323586df
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyvegc.h
@@ -0,0 +1,48 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVEGC_H_
+#define _BHYVEGC_H_
+/* Console handle; the structure itself is defined in bhyvegc.c. */
+struct bhyvegc;
+/* Image descriptor returned by bhyvegc_get_image(). */
+struct bhyvegc_image {
+ int vgamode; /* not written by bhyvegc.c; NOTE(review): meaning is set by users of the image */
+ int width; /* as passed to bhyvegc_init() / bhyvegc_resize() */
+ int height; /* as passed to bhyvegc_init() / bhyvegc_resize() */
+ uint32_t *data; /* one uint32_t per pixel; either caller-supplied or allocated by bhyvegc.c */
+};
+/*
+ * bhyvegc_init() creates a console, using 'fbaddr' as the pixel buffer when
+ * it is non-NULL and allocating one otherwise.  bhyvegc_set_fbaddr() points
+ * the console at a caller-supplied buffer.  bhyvegc_resize() records new
+ * dimensions and, for a console that allocated its own buffer, resizes and
+ * clears it.  bhyvegc_get_image() returns NULL for a NULL console.
+ */
+struct bhyvegc *bhyvegc_init(int width, int height, void *fbaddr);
+void bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr);
+void bhyvegc_resize(struct bhyvegc *gc, int width, int height);
+struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc);
+
+#endif /* _BHYVEGC_H_ */
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c
new file mode 100644
index 0000000000..ccf89b4613
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyverun.c
@@ -0,0 +1,1395 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/cpuset.h>
+
+#ifdef __FreeBSD__
+#include <amd64/vmm/intel/vmcs.h>
+#else
+#include <intel/vmcs.h>
+#endif
+
+#include <machine/atomic.h>
+#include <machine/segments.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <errno.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <sysexits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <machine/vmm.h>
+#ifndef WITHOUT_CAPSICUM
+#include <machine/vmm_dev.h>
+#endif
+#include <vmmapi.h>
+
+#ifndef __FreeBSD__
+#include <sys/stat.h>
+#endif
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "atkbdc.h"
+#include "console.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "fwctl.h"
+#include "gdb.h"
+#include "ioapic.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "smbiostbl.h"
+#include "xmsr.h"
+#include "spinup_ap.h"
+#include "rfb.h"
+#include "rtc.h"
+#include "vga.h"
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define MB (1024UL * 1024) /* 2^20, typed unsigned long */
+#define GB (1024UL * MB) /* 2^30, typed unsigned long */
+/*
+ * Human-readable descriptions of VMX exit reasons, indexed by the
+ * EXIT_REASON_* value.  Slots without an initializer are NULL;
+ * vmexit_vmx_desc() reports those, and out-of-range values, as "Unknown".
+ */
+static const char * const vmx_exit_reason_desc[] = {
+ [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
+ [EXIT_REASON_EXT_INTR] = "External interrupt",
+ [EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
+ [EXIT_REASON_INIT] = "INIT signal",
+ [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
+ [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
+ [EXIT_REASON_SMI] = "Other SMI",
+ [EXIT_REASON_INTR_WINDOW] = "Interrupt window",
+ [EXIT_REASON_NMI_WINDOW] = "NMI window",
+ [EXIT_REASON_TASK_SWITCH] = "Task switch",
+ [EXIT_REASON_CPUID] = "CPUID",
+ [EXIT_REASON_GETSEC] = "GETSEC",
+ [EXIT_REASON_HLT] = "HLT",
+ [EXIT_REASON_INVD] = "INVD",
+ [EXIT_REASON_INVLPG] = "INVLPG",
+ [EXIT_REASON_RDPMC] = "RDPMC",
+ [EXIT_REASON_RDTSC] = "RDTSC",
+ [EXIT_REASON_RSM] = "RSM",
+ [EXIT_REASON_VMCALL] = "VMCALL",
+ [EXIT_REASON_VMCLEAR] = "VMCLEAR",
+ [EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
+ [EXIT_REASON_VMPTRLD] = "VMPTRLD",
+ [EXIT_REASON_VMPTRST] = "VMPTRST",
+ [EXIT_REASON_VMREAD] = "VMREAD",
+ [EXIT_REASON_VMRESUME] = "VMRESUME",
+ [EXIT_REASON_VMWRITE] = "VMWRITE",
+ [EXIT_REASON_VMXOFF] = "VMXOFF",
+ [EXIT_REASON_VMXON] = "VMXON",
+ [EXIT_REASON_CR_ACCESS] = "Control-register accesses",
+ [EXIT_REASON_DR_ACCESS] = "MOV DR",
+ [EXIT_REASON_INOUT] = "I/O instruction",
+ [EXIT_REASON_RDMSR] = "RDMSR",
+ [EXIT_REASON_WRMSR] = "WRMSR",
+ [EXIT_REASON_INVAL_VMCS] =
+ "VM-entry failure due to invalid guest state",
+ [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
+ [EXIT_REASON_MWAIT] = "MWAIT",
+ [EXIT_REASON_MTF] = "Monitor trap flag",
+ [EXIT_REASON_MONITOR] = "MONITOR",
+ [EXIT_REASON_PAUSE] = "PAUSE",
+ [EXIT_REASON_MCE_DURING_ENTRY] =
+ "VM-entry failure due to machine-check event",
+ [EXIT_REASON_TPR] = "TPR below threshold",
+ [EXIT_REASON_APIC_ACCESS] = "APIC access",
+ [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
+ [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
+ [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
+ [EXIT_REASON_EPT_FAULT] = "EPT violation",
+ [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
+ [EXIT_REASON_INVEPT] = "INVEPT",
+ [EXIT_REASON_RDTSCP] = "RDTSCP",
+ [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
+ [EXIT_REASON_INVVPID] = "INVVPID",
+ [EXIT_REASON_WBINVD] = "WBINVD",
+ [EXIT_REASON_XSETBV] = "XSETBV",
+ [EXIT_REASON_APIC_WRITE] = "APIC write",
+ [EXIT_REASON_RDRAND] = "RDRAND",
+ [EXIT_REASON_INVPCID] = "INVPCID",
+ [EXIT_REASON_VMFUNC] = "VMFUNC",
+ [EXIT_REASON_ENCLS] = "ENCLS",
+ [EXIT_REASON_RDSEED] = "RDSEED",
+ [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
+ [EXIT_REASON_XSAVES] = "XSAVES",
+ [EXIT_REASON_XRSTORS] = "XRSTORS"
+};
+/*
+ * Signature shared by every entry of the handler[] dispatch table.
+ * vm_loop() resumes the guest when a handler returns VMEXIT_CONTINUE and
+ * calls abort() when it returns VMEXIT_ABORT.
+ */
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
+
+char *vmname;
+/* guest_ncpus, sockets, cores and threads are filled in by topology_parse(). */
+int guest_ncpus;
+uint16_t cores, maxcpus, sockets, threads;
+
+char *guest_uuid_str;
+/* The next three are exposed read-only through the fbsdrun_*() accessors. */
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
+static int virtio_msix = 1;
+static int x2apic_mode = 0; /* default is xAPIC */
+/* strictio is handed to emulate_inout(); strictmsr makes a failed MSR emulation inject #GP. */
+static int strictio;
+static int strictmsr = 1;
+
+static int acpi;
+
+static char *progname;
+static const int BSP = 0; /* the only vcpu fbsdrun_addcpu() accepts as 'fromcpu' */
+
+#ifndef __FreeBSD__
+int bcons_wait = 0;
+int bcons_connected = 0;
+pthread_mutex_t bcons_wait_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t bcons_wait_done = PTHREAD_COND_INITIALIZER;
+#endif
+/* vcpus added by fbsdrun_addcpu() and not yet removed by fbsdrun_deletecpu(). */
+static cpuset_t cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+/* One exit record per vcpu; vm_loop() passes &vmexit[vcpu] to vm_run(). */
+static struct vm_exit vmexit[VM_MAXCPU];
+/* Counters incremented by the corresponding vmexit_*() handlers. */
+struct bhyvestats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_reqidle;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t vmexit_inst_emul;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+} stats;
+/* Per-vcpu thread arguments, filled in by fbsdrun_addcpu(). */
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+/* Host cpu sets built by pincpu_parse() and applied to the vcpu thread in vm_loop(). */
+#ifdef __FreeBSD__
+static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
+#endif
+
+static void
+usage(int code)
+{
+ /*
+ * Print the option summary to stderr and exit with 'code'.  The three
+ * "%*s" conversions print an empty string padded to the width of
+ * progname, which lines the continuation rows up under the first one.
+ */
+ fprintf(stderr,
+ "Usage: %s [-abehuwxACHPSWY]\n"
+ " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
+ " %*s [-g <gdb port>] [-l <lpc>]\n"
+#ifdef __FreeBSD__
+ " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
+#else
+ " %*s [-m mem] [-s <pci>] [-U uuid] <vm>\n"
+#endif
+ " -a: local apic is in xAPIC mode (deprecated)\n"
+ " -A: create ACPI tables\n"
+ " -c: number of cpus and/or topology specification\n"
+ " -C: include guest memory in core file\n"
+#ifndef __FreeBSD__
+ " -d: suspend cpu at boot\n"
+#endif
+ " -e: exit on unhandled I/O access\n"
+ " -g: gdb port\n"
+ " -h: help\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -l: LPC device configuration\n"
+ " -m: memory size\n"
+#ifdef __FreeBSD__
+ " -p: pin 'vcpu' to 'hostcpu'\n"
+#endif
+ " -P: vmexit from the guest on pause\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -S: guest memory cannot be swapped\n"
+ " -u: RTC keeps UTC time\n"
+ " -U: uuid\n"
+ " -w: ignore unimplemented MSRs\n"
+ " -W: force virtio to use single-vector MSI\n"
+ " -x: local apic is in x2APIC mode\n"
+ " -Y: disable MPtable generation\n",
+ progname, (int)strlen(progname), "", (int)strlen(progname), "",
+ (int)strlen(progname), "");
+
+ exit(code);
+}
+
+/*
+ * Parse a '-c' topology specification of the form
+ * [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n].
+ *
+ * On success the result is stored in guest_ncpus, sockets, cores and
+ * threads and 0 is returned.  On a parse error, an out-of-range value or an
+ * inconsistent cpu count, -1 is returned and those globals are untouched.
+ *
+ * XXX This parser is known to have the following issues:
+ * 1. It accepts null key=value tokens ",,".
+ * 2. It accepts whitespace after = and before value.
+ * 3. Values out of range of INT are silently wrapped.
+ * 4. It doesn't check non-final values.
+ * 5. The apparently bogus limits of UINT16_MAX are for future expansion.
+ *
+ * The acceptance of a null specification ('-c ""') is by design to match the
+ * manual page syntax specification, this results in a topology of 1 vCPU.
+ */
+static int
+topology_parse(const char *opt)
+{
+	uint64_t ncpus;
+	int c, chk, n, s, t, tmp;
+	char *cp, *str, *tofree;
+	bool ns, scts;
+
+	c = 1, n = 1, s = 1, t = 1;
+	ns = false, scts = false;
+	/*
+	 * strsep() advances 'str' through the copy and leaves it NULL after
+	 * the last token, so the pointer returned by strdup() is kept
+	 * separately; it is the only value that may be handed to free().
+	 */
+	str = tofree = strdup(opt);
+	if (str == NULL)
+		goto out;
+
+	while ((cp = strsep(&str, ",")) != NULL) {
+		if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
+			n = tmp;
+			ns = true;
+		} else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
+			n = tmp;
+			ns = true;
+		} else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
+			s = tmp;
+			scts = true;
+		} else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
+			c = tmp;
+			scts = true;
+		} else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
+			t = tmp;
+			scts = true;
+#ifdef notyet  /* Do not expose this until vmm.ko implements it */
+		} else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
+			m = tmp;
+#endif
+		/* Skip the empty argument case from -c "" */
+		} else if (cp[0] == '\0')
+			continue;
+		else
+			goto out;
+		/* Any trailing garbage causes an error */
+		if (cp[chk] != '\0')
+			goto out;
+	}
+	free(tofree);
+	tofree = NULL;
+
+	/*
+	 * Range check 1 <= n <= UINT16_MAX all values
+	 */
+	if (n < 1 || s < 1 || c < 1 || t < 1 ||
+	    n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX ||
+	    t > UINT16_MAX)
+		return (-1);
+
+	/* If only the cpus was specified, use that as sockets */
+	if (!scts)
+		s = n;
+	/*
+	 * Compute sockets * cores * threads avoiding overflow
+	 * The range check above ensures these are 16 bit values
+	 * If n was specified check it against computed ncpus
+	 */
+	ncpus = (uint64_t)s * c * t;
+	if (ncpus > UINT16_MAX || (ns && n != ncpus))
+		return (-1);
+
+	guest_ncpus = ncpus;
+	sockets = s;
+	cores = c;
+	threads = t;
+	return(0);
+
+out:
+	free(tofree);
+	return (-1);
+}
+
+#ifndef WITHOUT_CAPSICUM
+/*
+ * 11-stable capsicum helpers
+ */
+/*
+ * Open the "libc" message catalog up front.  The descriptor returned by
+ * catopen() is deliberately discarded.
+ */
+static void
+bhyve_caph_cache_catpages(void)
+{
+	(void)catopen("libc", NL_CAT_LOCALE);
+}
+
+/*
+ * Restrict stdout and stderr to write, fcntl, fstat and ioctl rights, to the
+ * TIOCGETA and TIOCGWINSZ ioctls, and to the F_GETFL fcntl.
+ *
+ * Returns 0 on success and -1 as soon as one of the limit calls fails with
+ * anything other than ENOSYS.
+ */
+static int
+bhyve_caph_limit_stdoe(void)
+{
+	cap_rights_t rights;
+	unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
+	int fds[] = { STDOUT_FILENO, STDERR_FILENO };
+	int fd, idx;
+
+	cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
+	cap_rights_set(&rights, CAP_WRITE);
+
+	for (idx = 0; idx < nitems(fds); idx++) {
+		fd = fds[idx];
+
+		/* Evaluated left to right; the first hard failure ends it. */
+		if ((cap_rights_limit(fd, &rights) < 0 && errno != ENOSYS) ||
+		    (cap_ioctls_limit(fd, cmds, nitems(cmds)) < 0 &&
+		    errno != ENOSYS) ||
+		    (cap_fcntls_limit(fd, CAP_FCNTL_GETFL) < 0 &&
+		    errno != ENOSYS))
+			return (-1);
+	}
+
+	return (0);
+}
+
+#endif
+
+#ifdef __FreeBSD__
+/*
+ * Parse a "-p vcpu:hostcpu" argument and add 'hostcpu' to the set of host
+ * cpus recorded for 'vcpu' in vcpumap[], allocating the set on first use.
+ *
+ * Returns 0 on success; prints a message and returns -1 on a malformed
+ * argument, an out-of-range number or an allocation failure.
+ */
+static int
+pincpu_parse(const char *opt)
+{
+	cpuset_t *set;
+	int guestcpu, hostcpu;
+
+	if (sscanf(opt, "%d:%d", &guestcpu, &hostcpu) != 2) {
+		fprintf(stderr, "invalid format: %s\n", opt);
+		return (-1);
+	}
+
+	if (guestcpu < 0 || guestcpu >= VM_MAXCPU) {
+		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
+		    guestcpu, VM_MAXCPU - 1);
+		return (-1);
+	}
+
+	if (hostcpu < 0 || hostcpu >= CPU_SETSIZE) {
+		fprintf(stderr, "hostcpu '%d' outside valid range from "
+		    "0 to %d\n", hostcpu, CPU_SETSIZE - 1);
+		return (-1);
+	}
+
+	set = vcpumap[guestcpu];
+	if (set == NULL) {
+		set = malloc(sizeof(cpuset_t));
+		if (set == NULL) {
+			perror("malloc");
+			return (-1);
+		}
+		CPU_ZERO(set);
+		vcpumap[guestcpu] = set;
+	}
+	CPU_SET(hostcpu, set);
+	return (0);
+}
+#endif
+
+/*
+ * Inject exception 'vector' into 'vcpu', with the optional error code, and
+ * assert that the injection succeeded.  'arg' is the struct vmctx pointer.
+ */
+void
+vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
+    int errcode)
+{
+	struct vmctx *ctx = arg;
+	const int restart_instruction = 1;
+	int rv;
+
+	rv = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
+	    restart_instruction);
+	assert(rv == 0);
+}
+
+/*
+ * Thin wrapper: pass 'gaddr' and 'len' through to vm_map_gpa() and return
+ * its result unchanged.
+ */
+void *
+paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
+{
+	void *hostaddr;
+
+	hostaddr = vm_map_gpa(ctx, gaddr, len);
+	return (hostaddr);
+}
+
+/* Accessor for the file-private guest_vmexit_on_pause setting. */
+int
+fbsdrun_vmexit_on_pause(void)
+{
+	return (guest_vmexit_on_pause);
+}
+
+/* Accessor for the file-private guest_vmexit_on_hlt setting. */
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+	return (guest_vmexit_on_hlt);
+}
+
+/* Accessor for the file-private virtio_msix setting (initialised to 1). */
+int
+fbsdrun_virtio_msix(void)
+{
+	return (virtio_msix);
+}
+
+/*
+ * Entry point of a vcpu thread.  'param' is the mt_vmm_info slot that
+ * fbsdrun_addcpu() filled in for this vcpu.
+ *
+ * Names the thread "vcpu N", registers the vcpu with the gdb stub and then
+ * enters vm_loop() starting at the rip stored in vmexit[N].  If vm_loop()
+ * ever returns, the whole process exits with status 1.
+ */
+static void *
+fbsdrun_start_thread(void *param)
+{
+	char tname[MAXCOMLEN + 1];
+	struct mt_vmm_info *mtp;
+	int vcpu;
+
+	mtp = param;
+	vcpu = mtp->mt_vcpu;
+
+	snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
+	/*
+	 * Use pthread_self() rather than mtp->mt_thr: this thread may be
+	 * running before pthread_create() in the parent has stored the new
+	 * thread id there.
+	 */
+	pthread_set_name_np(pthread_self(), tname);
+
+	gdb_cpu_add(vcpu);
+
+	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+	/* not reached */
+	exit(1);
+	return (NULL);
+}
+
+/*
+ * Activate vcpu 'newcpu' on behalf of 'fromcpu' (which must be the BSP) and
+ * start a thread that runs it from 'rip'.  On non-FreeBSD builds the vcpu is
+ * additionally suspended before its thread starts when 'suspend' is true.
+ *
+ * Activation failure ends the process through err(); thread creation
+ * failure trips an assertion.
+ */
+#ifdef __FreeBSD__
+void
+fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
+#else
+void
+fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip,
+    bool suspend)
+#endif
+{
+	struct mt_vmm_info *info;
+	int rv;
+
+	assert(fromcpu == BSP);
+
+	/*
+	 * Activation has to happen here, in the context of 'fromcpu'.  Were
+	 * it left to the new vcpu's own thread, vmm.ko and bhyve would
+	 * disagree about the active set for a while, which can race with
+	 * vm_suspend().
+	 */
+	rv = vm_activate_cpu(ctx, newcpu);
+	if (rv != 0)
+		err(EX_OSERR, "could not activate CPU %d", newcpu);
+
+	CPU_SET_ATOMIC(newcpu, &cpumask);
+
+#ifndef __FreeBSD__
+	if (suspend)
+		(void) vm_suspend_cpu(ctx, newcpu);
+#endif
+
+	/* Seed the exit record so that execution begins at 'rip'. */
+	vmexit[newcpu].rip = rip;
+	vmexit[newcpu].inst_length = 0;
+
+	info = &mt_vmm_info[newcpu];
+	info->mt_ctx = ctx;
+	info->mt_vcpu = newcpu;
+
+	rv = pthread_create(&info->mt_thr, NULL, fbsdrun_start_thread, info);
+	assert(rv == 0);
+}
+
+/*
+ * Remove 'vcpu' from cpumask.  Exits the process with status 4 if the vcpu
+ * was not in the mask.  Returns the result of CPU_EMPTY() on the mask after
+ * the removal.  'ctx' is not used.
+ */
+static int
+fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
+{
+	int known;
+
+	known = CPU_ISSET(vcpu, &cpumask);
+	if (!known) {
+		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
+		exit(4);
+	}
+
+	CPU_CLR_ATOMIC(vcpu, &cpumask);
+	return (CPU_EMPTY(&cpumask));
+}
+
+/*
+ * Called by vmexit_inout() for a guest write to GUEST_NIO_PORT.  It is
+ * currently a placeholder: none of the arguments are used and the guest is
+ * always resumed.
+ */
+static int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+    uint32_t eax)
+{
+#if BHYVE_DEBUG
+	/*
+	 * put guest-driven debug here
+	 */
+#endif
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * Handle an I/O port exit.
+ *
+ * A write to GUEST_NIO_PORT is routed to vmexit_handle_notify(); every other
+ * access goes through emulate_inout().  Returns VMEXIT_CONTINUE when the
+ * access was handled and VMEXIT_ABORT, after printing a diagnostic, when
+ * emulate_inout() reports an error.
+ */
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+	int error;
+	int bytes, port, in, out;
+	int vcpu;
+
+	vcpu = *pvcpu;
+
+	port = vme->u.inout.port;
+	bytes = vme->u.inout.bytes;
+	in = vme->u.inout.in;
+	out = !in;
+
+	/* Extra-special case of host notifications */
+	if (out && port == GUEST_NIO_PORT) {
+		error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
+		return (error);
+	}
+
+	error = emulate_inout(ctx, vcpu, vme, strictio);
+	if (error) {
+		/*
+		 * Report the rip of this exit ('vme').  The bare name
+		 * 'vmexit' here is the file-scope per-vcpu array, so
+		 * vmexit->rip would always be the first vcpu's value.
+		 */
+		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
+		    in ? "in" : "out",
+		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
+		    port, vme->rip);
+		return (VMEXIT_ABORT);
+	} else {
+		return (VMEXIT_CONTINUE);
+	}
+}
+
+/*
+ * Handle a RDMSR exit: emulate the read and store the low 32 bits of the
+ * value in the guest's RAX and the high 32 bits in RDX.
+ *
+ * When emulation fails a message is printed; with strictmsr set a #GP is
+ * injected instead of returning a value, otherwise the guest reads zero.
+ * Always returns VMEXIT_CONTINUE.
+ */
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+	uint64_t msrval = 0;
+	uint32_t lo, hi;
+	int rv;
+
+	rv = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &msrval);
+	if (rv != 0) {
+		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
+		    vme->u.msr.code, *pvcpu);
+		if (strictmsr) {
+			vm_inject_gp(ctx, *pvcpu);
+			return (VMEXIT_CONTINUE);
+		}
+	}
+
+	lo = msrval;
+	hi = msrval >> 32;
+
+	rv = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, lo);
+	assert(rv == 0);
+
+	rv = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, hi);
+	assert(rv == 0);
+
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * Handle a WRMSR exit.  When emulation fails a message is printed and, with
+ * strictmsr set, a #GP is injected into the guest.  Always returns
+ * VMEXIT_CONTINUE.
+ */
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+	int rv;
+
+	rv = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
+	if (rv == 0)
+		return (VMEXIT_CONTINUE);
+
+	fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
+	    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
+	if (strictmsr)
+		vm_inject_gp(ctx, *pvcpu);
+
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * Handle a spinup-AP exit by passing the target vcpu and start rip from the
+ * exit record to spinup_ap().  Its result is ignored and the calling vcpu
+ * is always resumed.
+ */
+static int
+vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+	const int target = vme->u.spinup_ap.vcpu;
+
+	(void)spinup_ap(ctx, *pvcpu, target, vme->u.spinup_ap.rip);
+
+	return (VMEXIT_CONTINUE);
+}
+
+#define DEBUG_EPT_MISCONFIG /* defined unconditionally, so the EPT dump in vmexit_vmx() is always built */
+#ifdef DEBUG_EPT_MISCONFIG
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+/* Scratch storage that vmexit_vmx() fills and prints on an EPT misconfiguration exit. */
+static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
+static int ept_misconfig_ptenum;
+#endif
+
+/*
+ * Map a VMX exit reason to its description.  Values beyond the end of
+ * vmx_exit_reason_desc[] and slots that hold NULL yield "Unknown".
+ */
+static const char *
+vmexit_vmx_desc(uint32_t exit_reason)
+{
+	const char *desc = NULL;
+
+	if (exit_reason < nitems(vmx_exit_reason_desc))
+		desc = vmx_exit_reason_desc[exit_reason];
+
+	return (desc != NULL ? desc : "Unknown");
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ /*
+ * Dump the VMX exit record to stderr and ask vm_loop() to abort.  Note
+ * that the parameter 'vmexit' hides the file-scope vmexit[] array in
+ * this function.
+ */
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tVMX\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
+ fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
+ vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
+ fprintf(stderr, "\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
+ fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
+#ifdef DEBUG_EPT_MISCONFIG
+ if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vm_get_register(ctx, *pvcpu,
+ VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
+ &ept_misconfig_gpa);
+ vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
+ &ept_misconfig_ptenum);
+ fprintf(stderr, "\tEPT misconfiguration:\n");
+ fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
+ fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
+ ept_misconfig_ptenum, ept_misconfig_pte[0],
+ ept_misconfig_pte[1], ept_misconfig_pte[2],
+ ept_misconfig_pte[3]);
+ }
+#endif /* DEBUG_EPT_MISCONFIG */
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ /* Dump the SVM exit record to stderr and ask vm_loop() to abort. */
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tSVM\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
+ fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
+ fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
+ return (VMEXIT_ABORT);
+}
+
+/*
+ * VM_EXITCODE_BOGUS handler: asserts that the exit carries a zero
+ * instruction length, counts it and resumes the guest.
+ */
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	assert(vmexit->inst_length == 0);
+	stats.vmexit_bogus++;
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * VM_EXITCODE_REQIDLE handler: asserts that the exit carries a zero
+ * instruction length, counts it and resumes the guest.
+ */
+static int
+vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	assert(vmexit->inst_length == 0);
+	stats.vmexit_reqidle++;
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * HLT exit handler: count the exit and carry on with the next instruction.
+ * The exit itself only exists to be friendly to the host scheduler.
+ */
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	stats.vmexit_hlt++;
+	return (VMEXIT_CONTINUE);
+}
+
+/* PAUSE exit handler: count the exit and resume the guest. */
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	stats.vmexit_pause++;
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * VM_EXITCODE_MTRAP handler: asserts a zero instruction length, counts the
+ * exit, notifies the gdb stub for this vcpu and resumes the guest.
+ */
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	assert(vmexit->inst_length == 0);
+	stats.vmexit_mtrap++;
+
+	gdb_cpu_mtrap(*pvcpu);
+	return (VMEXIT_CONTINUE);
+}
+
+/*
+ * Instruction-emulation exit handler: run the decoded instruction through
+ * emulate_mem().  On success the guest is resumed.  On failure the
+ * instruction bytes and rip are printed (preceded by the guest physical
+ * address when the error is ESRCH) and VMEXIT_ABORT is returned.
+ */
+static int
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	struct vie *vie = &vmexit->u.inst_emul.vie;
+	int error, idx;
+
+	stats.vmexit_inst_emul++;
+
+	error = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
+	    vie, &vmexit->u.inst_emul.paging);
+	if (error == 0)
+		return (VMEXIT_CONTINUE);
+
+	if (error == ESRCH) {
+		fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+		    vmexit->u.inst_emul.gpa);
+	}
+
+	fprintf(stderr, "Failed to emulate instruction [");
+	for (idx = 0; idx < vie->num_valid; idx++) {
+		/* Bytes are space separated, with no space after the last. */
+		fprintf(stderr, "0x%02x%s", vie->inst[idx],
+		    idx != (vie->num_valid - 1) ? " " : "");
+	}
+	fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
+	return (VMEXIT_ABORT);
+}
+
+static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
+
+/*
+ * Suspend exit handler.  Every vcpu removes itself from cpumask.  A non-BSP
+ * vcpu then signals resetcpu_cond and ends its thread.  The BSP waits on
+ * that condition until cpumask is empty and then exits the process with a
+ * status derived from the suspend reason: 0 reset, 1 poweroff, 2 halt,
+ * 3 triple fault, 100 anything else.
+ */
+static int
+vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	const enum vm_suspend_how how = vmexit->u.suspended.how;
+	int status;
+
+	fbsdrun_deletecpu(ctx, *pvcpu);
+
+	if (*pvcpu != BSP) {
+		pthread_mutex_lock(&resetcpu_mtx);
+		pthread_cond_signal(&resetcpu_cond);
+		pthread_mutex_unlock(&resetcpu_mtx);
+		pthread_exit(NULL);
+	}
+
+	pthread_mutex_lock(&resetcpu_mtx);
+	while (!CPU_EMPTY(&cpumask))
+		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
+	pthread_mutex_unlock(&resetcpu_mtx);
+
+	switch (how) {
+	case VM_SUSPEND_RESET:
+		status = 0;
+		break;
+	case VM_SUSPEND_POWEROFF:
+		status = 1;
+		break;
+	case VM_SUSPEND_HALT:
+		status = 2;
+		break;
+	case VM_SUSPEND_TRIPLEFAULT:
+		status = 3;
+		break;
+	default:
+		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
+		status = 100;
+		break;
+	}
+	exit(status);
+	return (0);	/* NOTREACHED */
+}
+
+/*
+ * Debug exit handler: hand the vcpu to gdb_cpu_suspend() and resume the
+ * guest once that call returns.
+ */
+static int
+vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	gdb_cpu_suspend(*pvcpu);
+	return (VMEXIT_CONTINUE);
+}
+/*
+ * Dispatch table indexed by VM_EXITCODE_*.  vm_loop() treats a NULL slot as
+ * an unexpected exit code.  The HLT and PAUSE slots are not set here; they
+ * are installed by fbsdrun_set_capabilities() when those exits are enabled.
+ */
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_INOUT_STR] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_SVM] = vmexit_svm,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_REQIDLE] = vmexit_reqidle,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+ [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+ [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
+ [VM_EXITCODE_DEBUG] = vmexit_debug,
+};
+
+/*
+ * Run 'vcpu' starting at 'startrip' until vm_run() fails.
+ *
+ * On FreeBSD the calling thread is first bound to the host cpus recorded in
+ * vcpumap[] for this vcpu, if any.  Each exit is dispatched through
+ * handler[]; an exit code with no handler ends the process with status 4,
+ * a handler returning VMEXIT_ABORT calls abort(), and any return value other
+ * than VMEXIT_CONTINUE also exits with status 4.  When vm_run() reports an
+ * error the loop ends, a message is printed and the function returns.
+ */
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
+{
+	int error, rc;
+	enum vm_exitcode exitcode;
+	cpuset_t active_cpus;
+
+#ifdef __FreeBSD__
+	if (vcpumap[vcpu] != NULL) {
+		error = pthread_setaffinity_np(pthread_self(),
+		    sizeof(cpuset_t), vcpumap[vcpu]);
+		assert(error == 0);
+	}
+#endif
+	error = vm_active_cpus(ctx, &active_cpus);
+	/* active_cpus is only meaningful if the query itself succeeded. */
+	assert(error == 0);
+	assert(CPU_ISSET(vcpu, &active_cpus));
+
+	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
+	assert(error == 0);
+
+	while (1) {
+		error = vm_run(ctx, vcpu, &vmexit[vcpu]);
+		if (error != 0)
+			break;
+
+		exitcode = vmexit[vcpu].exitcode;
+		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
+			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
+			    exitcode);
+			exit(4);
+		}
+
+		rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
+
+		switch (rc) {
+		case VMEXIT_CONTINUE:
+			break;
+		case VMEXIT_ABORT:
+			abort();
+		default:
+			exit(4);
+		}
+	}
+	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+/*
+ * Return how many vcpus the guest may use: VM_MAXCPU when querying the
+ * UNRESTRICTED_GUEST capability on the BSP succeeds, otherwise 1.  Only the
+ * success of the query matters; the capability value is not examined.
+ */
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+	int capval, rv;
+
+	rv = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &capval);
+
+	/* More than one processor needs the UNRESTRICTED_GUEST capability. */
+	return (rv == 0 ? VM_MAXCPU : 1);
+}
+
+/*
+ * Apply the per-vCPU capability settings selected on the command line:
+ * optional exits on HLT and PAUSE, the x2APIC mode and (FreeBSD only)
+ * INVPCID.  A requested exit type that the vCPU does not support, or a
+ * failure to set the x2APIC state, terminates the process with status 4.
+ * The HLT/PAUSE exit handlers are registered once, when called for the BSP.
+ */
+void
+fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
+{
+	int rc, capval;
+
+	if (fbsdrun_vmexit_on_hlt()) {
+		rc = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &capval);
+		if (rc < 0) {
+			fprintf(stderr, "VM exit on HLT not supported\n");
+			exit(4);
+		}
+		vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
+		if (cpu == BSP)
+			handler[VM_EXITCODE_HLT] = vmexit_hlt;
+	}
+
+	if (fbsdrun_vmexit_on_pause()) {
+		/*
+		 * pause exit support required for this mode
+		 */
+		rc = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &capval);
+		if (rc < 0) {
+			fprintf(stderr,
+			    "SMP mux requested, no pause support\n");
+			exit(4);
+		}
+		vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
+		if (cpu == BSP)
+			handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+	}
+
+	rc = vm_set_x2apic_state(ctx, cpu,
+	    x2apic_mode ? X2APIC_ENABLED : X2APIC_DISABLED);
+	if (rc != 0) {
+		fprintf(stderr, "Unable to set x2apic state (%d)\n", rc);
+		exit(4);
+	}
+
+#ifdef __FreeBSD__
+	vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
+#endif
+}
+
+/*
+ * Create (or attach to) the virtual machine called vmname and return an open
+ * context for it.
+ *
+ * If the VM already exists and a boot ROM is configured, the VM is
+ * reinitialized after it has been opened.  If it already exists and no boot
+ * ROM is configured, it is used as-is.  A freshly created VM without a boot
+ * ROM cannot be booted and is rejected.  Finally the CPU topology is applied.
+ *
+ * Every failure is fatal: the function prints a message and exits, so the
+ * returned pointer is never NULL.
+ */
+static struct vmctx *
+do_open(const char *vmname)
+{
+ struct vmctx *ctx;
+ int error;
+ bool reinit, romboot;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+ const cap_ioctl_t *cmds;
+ size_t ncmds;
+#endif
+
+ reinit = romboot = false;
+
+ if (lpc_bootrom())
+ romboot = true;
+
+ error = vm_create(vmname);
+ if (error) {
+ if (errno == EEXIST) {
+ if (romboot) {
+ reinit = true;
+ } else {
+ /*
+ * The virtual machine has been setup by the
+ * userspace bootloader.
+ */
+ }
+ } else {
+ perror("vm_create");
+ exit(4);
+ }
+ } else {
+ if (!romboot) {
+ /*
+ * If the virtual machine was just created then a
+ * bootrom must be configured to boot it.
+ */
+ fprintf(stderr, "virtual machine cannot be booted\n");
+ exit(4);
+ }
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(4);
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ /* Restrict the VM device descriptor to ioctl and read/write mmap. */
+ cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
+ if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ /*
+ * NOTE(review): presumably the first call only stores the command count
+ * in ncmds and the second returns an allocated list (it is freed
+ * below) -- confirm against the vm_get_ioctls() implementation.
+ */
+ vm_get_ioctls(&ncmds);
+ cmds = vm_get_ioctls(NULL);
+ if (cmds == NULL)
+ errx(EX_OSERR, "out of memory");
+ if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ free((cap_ioctl_t *)cmds);
+#endif
+
+ /* Only set when the VM pre-existed and a boot ROM is configured. */
+ if (reinit) {
+ error = vm_reinit(ctx);
+ if (error) {
+ perror("vm_reinit");
+ exit(4);
+ }
+ }
+ error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
+ if (error)
+ errx(EX_OSERR, "vm_set_topology");
+ return (ctx);
+}
+
+#ifndef __FreeBSD__
+
+#define FILE_PROVISIONING "/var/svc/provisioning"
+#define FILE_PROVISION_SUCCESS "/var/svc/provision_success"
+
+/*
+ * If the provisioning marker file exists, rename it to the success marker.
+ * A missing marker is silently ignored; a failed rename is reported on
+ * stderr but is not fatal.
+ */
+static void
+mark_provisioned(void)
+{
+	struct stat st;
+	int rv;
+
+	/* Nothing to do unless the provisioning marker is present. */
+	if (lstat(FILE_PROVISIONING, &st) == 0) {
+		rv = rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS);
+		if (rv != 0) {
+			(void) fprintf(stderr, "Cannot rename %s to %s: %s\n",
+			    FILE_PROVISIONING, FILE_PROVISION_SUCCESS,
+			    strerror(errno));
+		}
+	}
+}
+
+#endif
+
+/*
+ * Program entry point.
+ *
+ * Parses the command line, creates/opens the VM named by the single
+ * positional argument, sets up guest memory and the emulated devices, builds
+ * the guest tables (MP table, SMBIOS, optionally ACPI), starts the boot
+ * processor and finally enters the mevent dispatch loop.  Errors are
+ * reported on stderr and terminate the process.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c, error, dbg_port, gdb_port, err, bvmcons;
+	int max_vcpus, mptgen, memflags;
+	int rtc_localtime;
+	bool gdb_stop;
+#ifndef __FreeBSD__
+	bool suspend = false;
+#endif
+	struct vmctx *ctx;
+	uint64_t rip;
+	size_t memsize;
+	char *optstr;
+
+	bvmcons = 0;
+	progname = basename(argv[0]);
+	dbg_port = 0;
+	gdb_port = 0;
+	gdb_stop = false;
+	guest_ncpus = 1;
+	sockets = cores = threads = 1;
+	maxcpus = 0;
+	memsize = 256 * MB;
+	mptgen = 1;
+	rtc_localtime = 1;
+	memflags = 0;
+
+	/* -p (vcpu pinning) is FreeBSD-only; -d is only accepted elsewhere. */
+#ifdef __FreeBSD__
+	optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:";
+#else
+	optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:";
+#endif
+	while ((c = getopt(argc, argv, optstr)) != -1) {
+		switch (c) {
+		case 'a':
+			x2apic_mode = 0;
+			break;
+		case 'A':
+			acpi = 1;
+			break;
+		case 'b':
+			bvmcons = 1;
+			break;
+		case 'B':
+			if (smbios_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid SMBIOS "
+				    "configuration '%s'", optarg);
+			}
+			break;
+#ifndef __FreeBSD__
+		case 'd':
+			suspend = true;
+			break;
+#else
+		case 'p':
+			if (pincpu_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid vcpu pinning "
+				    "configuration '%s'", optarg);
+			}
+			break;
+#endif
+		case 'c':
+			if (topology_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid cpu topology "
+				    "'%s'", optarg);
+			}
+			break;
+		case 'C':
+			memflags |= VM_MEM_F_INCORE;
+			break;
+		case 'g':
+			dbg_port = atoi(optarg);
+			break;
+		case 'G':
+			/* A leading 'w' sets gdb_stop; the port follows it. */
+			if (optarg[0] == 'w') {
+				gdb_stop = true;
+				optarg++;
+			}
+			gdb_port = atoi(optarg);
+			break;
+		case 'l':
+			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
+				lpc_print_supported_devices();
+				exit(0);
+			} else if (lpc_device_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid lpc device "
+				    "configuration '%s'", optarg);
+			}
+			break;
+		case 's':
+			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
+				pci_print_supported_devices();
+				exit(0);
+			} else if (pci_parse_slot(optarg) != 0)
+				exit(4);
+			else
+				break;
+		case 'S':
+			memflags |= VM_MEM_F_WIRED;
+			break;
+		case 'm':
+			error = vm_parse_memsize(optarg, &memsize);
+			if (error)
+				errx(EX_USAGE, "invalid memsize '%s'", optarg);
+			break;
+		case 'H':
+			guest_vmexit_on_hlt = 1;
+			break;
+		case 'I':
+			/*
+			 * The "-I" option was used to add an ioapic to the
+			 * virtual machine.
+			 *
+			 * An ioapic is now provided unconditionally for each
+			 * virtual machine and this option is now deprecated.
+			 */
+			break;
+		case 'P':
+			guest_vmexit_on_pause = 1;
+			break;
+		case 'e':
+			strictio = 1;
+			break;
+		case 'u':
+			rtc_localtime = 0;
+			break;
+		case 'U':
+			guest_uuid_str = optarg;
+			break;
+		case 'w':
+			strictmsr = 0;
+			break;
+		case 'W':
+			virtio_msix = 0;
+			break;
+		case 'x':
+			x2apic_mode = 1;
+			break;
+		case 'Y':
+			mptgen = 0;
+			break;
+		case 'h':
+			usage(0);
+		default:
+			usage(1);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* Exactly one positional argument is expected: the VM name. */
+	if (argc != 1)
+		usage(1);
+
+	vmname = argv[0];
+	ctx = do_open(vmname);
+
+	max_vcpus = num_vcpus_allowed(ctx);
+	if (guest_ncpus > max_vcpus) {
+		fprintf(stderr, "%d vCPUs requested but only %d available\n",
+		    guest_ncpus, max_vcpus);
+		exit(4);
+	}
+
+	fbsdrun_set_capabilities(ctx, BSP);
+
+	vm_set_memflags(ctx, memflags);
+#ifdef __FreeBSD__
+	err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+#else
+	/*
+	 * Keep retrying while the allocation fails with ENOMEM.  The loop
+	 * condition also tests err: errno is not meaningful after a
+	 * successful call, so a stale ENOMEM must not trigger another setup.
+	 */
+	do {
+		errno = 0;
+		err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+		error = errno;
+		if (err != 0 && error == ENOMEM) {
+			(void) fprintf(stderr, "Unable to allocate memory "
+			    "(%zu), retrying in 1 second\n", memsize);
+			sleep(1);
+		}
+	} while (err != 0 && error == ENOMEM);
+#endif
+	if (err) {
+		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
+		exit(4);
+	}
+
+	error = init_msr();
+	if (error) {
+		fprintf(stderr, "init_msr error %d\n", error);
+		exit(4);
+	}
+
+	init_mem();
+	init_inout();
+	atkbdc_init(ctx);
+	pci_irq_init(ctx);
+	ioapic_init(ctx);
+
+	rtc_init(ctx, rtc_localtime);
+	sci_init(ctx);
+
+	/*
+	 * Exit if a device emulation finds an error in its initialization
+	 */
+	if (init_pci(ctx) != 0) {
+		perror("device emulation initialization error");
+		exit(4);
+	}
+
+	if (dbg_port != 0)
+		init_dbgport(dbg_port);
+
+	if (gdb_port != 0)
+		init_gdb(ctx, gdb_port, gdb_stop);
+
+	if (bvmcons)
+		init_bvmcons();
+
+	vga_init(1);
+
+	if (lpc_bootrom()) {
+		if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
+			fprintf(stderr, "ROM boot failed: unrestricted guest "
+			    "capability not available\n");
+			exit(4);
+		}
+		error = vcpu_reset(ctx, BSP);
+		assert(error == 0);
+	}
+
+	/* The BSP's current RIP becomes its start address below. */
+	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+	assert(error == 0);
+
+	/*
+	 * build the guest tables, MP etc.
+	 */
+	if (mptgen) {
+		error = mptable_build(ctx, guest_ncpus);
+		if (error) {
+			perror("error to build the guest tables");
+			exit(4);
+		}
+	}
+
+	error = smbios_build(ctx);
+	assert(error == 0);
+
+	if (acpi) {
+		error = acpi_build(ctx, guest_ncpus);
+		assert(error == 0);
+	}
+
+	if (lpc_bootrom())
+		fwctl_init();
+
+	/*
+	 * Change the proc title to include the VM name.
+	 */
+	setproctitle("%s", vmname);
+
+#ifndef WITHOUT_CAPSICUM
+	caph_cache_catpages();
+
+	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+
+	if (caph_enter() == -1)
+		errx(EX_OSERR, "cap_enter() failed");
+#endif
+
+#ifndef __FreeBSD__
+	/*
+	 * If applicable, wait for bhyveconsole
+	 */
+	if (bcons_wait) {
+		printf("Waiting for bhyveconsole connection...\n");
+		(void) pthread_mutex_lock(&bcons_wait_lock);
+		while (!bcons_connected) {
+			(void) pthread_cond_wait(&bcons_wait_done,
+			    &bcons_wait_lock);
+		}
+		(void) pthread_mutex_unlock(&bcons_wait_lock);
+	}
+#endif
+
+	/*
+	 * Add CPU 0
+	 */
+#ifdef __FreeBSD__
+	fbsdrun_addcpu(ctx, BSP, BSP, rip);
+#else
+	fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend);
+
+	mark_provisioned();
+#endif
+
+	/*
+	 * Head off to the main event dispatch loop
+	 */
+	mevent_dispatch();
+
+	exit(4);
+}
diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h
new file mode 100644
index 0000000000..cdde04862c
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyverun.h
@@ -0,0 +1,74 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+/*
+ * Return values of the VM exit handlers.  The vCPU loop in bhyverun.c keeps
+ * running the guest on VMEXIT_CONTINUE and calls abort() on VMEXIT_ABORT.
+ */
+#define VMEXIT_CONTINUE (0)
+#define VMEXIT_ABORT (-1)
+
+struct vmctx;
+/* Globals set up by main() in bhyverun.c. */
+extern int guest_ncpus;
+extern char *guest_uuid_str;
+extern char *vmname;
+#ifndef __FreeBSD__
+/*
+ * Console-wait handshake: when bcons_wait is set, main() blocks on
+ * bcons_wait_done (under bcons_wait_lock) until bcons_connected is non-zero.
+ */
+extern int bcons_wait;
+extern int bcons_connected;
+extern pthread_mutex_t bcons_wait_lock;
+extern pthread_cond_t bcons_wait_done;
+#endif
+
+void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
+
+void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
+/*
+ * NOTE(review): this header uses bool, uintptr_t, uint64_t and the pthread
+ * types without including their headers, so includers must pull those in
+ * first.  On non-FreeBSD builds main() passes the -d option as "suspend".
+ */
+#ifdef __FreeBSD__
+void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
+#else
+void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip,
+ bool suspend);
+#endif
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+int fbsdrun_disable_x2apic(void);
+int fbsdrun_virtio_msix(void);
+#endif
diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c
new file mode 100644
index 0000000000..fcb4149b62
--- /dev/null
+++ b/usr/src/cmd/bhyve/block_if.c
@@ -0,0 +1,1028 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+#include <sys/limits.h>
+#include <sys/uio.h>
+#ifndef __FreeBSD__
+#include <sys/dkio.h>
+#endif
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <machine/atomic.h>
+
+#include "bhyverun.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "block_if.h"
+
+#define BLOCKIF_SIG 0xb109b109
+
+#ifdef __FreeBSD__
+#define BLOCKIF_NUMTHR 8
+#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR)
+#else
+/* Enlarge to keep pace with the virtio-block ring size */
+#define BLOCKIF_NUMTHR 16
+#define BLOCKIF_MAXREQ (128 + BLOCKIF_NUMTHR)
+#endif
+
+/*
+ * Operation carried by a queued request; blockif_proc() switches on it.
+ * NOTE(review): BOP_WRITE_SYNC has no case of its own in the blockif_proc()
+ * visible in this file and would take its default (EINVAL) path -- confirm
+ * whether that is intended.
+ */
+enum blockop {
+ BOP_READ,
+ BOP_WRITE,
+#ifndef __FreeBSD__
+ BOP_WRITE_SYNC,
+#endif
+ BOP_FLUSH,
+ BOP_DELETE
+};
+
+/*
+ * Life cycle of a request element:
+ *   BST_FREE  - on the free queue, not in use
+ *   BST_BLOCK - on the pending queue, held back because an earlier request
+ *               ends at the offset where this one starts
+ *   BST_PEND  - on the pending queue, ready for a worker thread
+ *   BST_BUSY  - on the busy queue, owned by a worker thread
+ *   BST_DONE  - processing finished, not yet returned to the free queue
+ */
+enum blockstat {
+ BST_FREE,
+ BST_BLOCK,
+ BST_PEND,
+ BST_BUSY,
+ BST_DONE
+};
+
+/*
+ * One slot of the request queue.  be_link chains the element on exactly one
+ * of the context's free/pending/busy queues.  be_tid is the worker thread
+ * that dequeued the request.  be_block is the offset just past the end of
+ * the request's data (OFF_MAX for operations without a data range); it is
+ * compared against the start offset of other requests to order them.
+ */
+struct blockif_elem {
+ TAILQ_ENTRY(blockif_elem) be_link;
+ struct blockif_req *be_req;
+ enum blockop be_op;
+ enum blockstat be_status;
+ pthread_t be_tid;
+ off_t be_block;
+};
+
+#ifndef __FreeBSD__
+/*
+ * How the backend's write cache can be controlled, as probed by
+ * blockif_open(): not at all, through the DKIOCSETWCE ioctl, or by toggling
+ * O_DSYNC on the descriptor with fcntl().
+ */
+enum blockif_wce {
+ WCE_NONE = 0,
+ WCE_IOCTL,
+ WCE_FCNTL
+};
+#endif
+
+/*
+ * State of one open block backend.
+ *
+ * bc_magic holds BLOCKIF_SIG while the context is valid and is checked by
+ * the public entry points.  bc_ischr records whether the backing object is
+ * a character device; bc_isgeom makes each worker thread allocate a bounce
+ * buffer.  bc_sectsz is the sector size reported by blockif_sectsz(), while
+ * bc_psectsz and bc_psectoff are reported by blockif_psectsz().  bc_closing
+ * tells the worker threads to exit.  bc_mtx protects the three queues and
+ * bc_cond is signalled when work becomes available.
+ */
+struct blockif_ctxt {
+ int bc_magic;
+ int bc_fd;
+ int bc_ischr;
+ int bc_isgeom;
+ int bc_candelete;
+#ifndef __FreeBSD__
+ enum blockif_wce bc_wce;
+#endif
+ int bc_rdonly;
+ off_t bc_size;
+ int bc_sectsz;
+ int bc_psectsz;
+ int bc_psectoff;
+ int bc_closing;
+ pthread_t bc_btid[BLOCKIF_NUMTHR];
+ pthread_mutex_t bc_mtx;
+ pthread_cond_t bc_cond;
+
+ /* Request elements and free/pending/busy queues */
+ TAILQ_HEAD(, blockif_elem) bc_freeq;
+ TAILQ_HEAD(, blockif_elem) bc_pendq;
+ TAILQ_HEAD(, blockif_elem) bc_busyq;
+ struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
+};
+
+static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
+
+/*
+ * Rendezvous record used by blockif_cancel(): the canceller pushes one onto
+ * the blockif_bse_head list, signals the worker thread with SIGCONT and
+ * waits on bse_cond until the signal handler clears bse_pending.
+ */
+struct blockif_sig_elem {
+ pthread_mutex_t bse_mtx;
+ pthread_cond_t bse_cond;
+ int bse_pending;
+ struct blockif_sig_elem *bse_next;
+};
+
+/* Head of the singly linked list above; updated with atomic_cmpset_ptr(). */
+static struct blockif_sig_elem *blockif_bse_head;
+
+/*
+ * Take an element from the free queue, bind it to breq/op and append it to
+ * the pending queue.  The caller must already have verified that the free
+ * queue is not empty.
+ *
+ * The element records the offset just past the end of the request.  If a
+ * pending or busy request ends exactly where this one starts, the new
+ * request is marked BST_BLOCK so that it is not picked up yet.
+ *
+ * Returns non-zero when the request is immediately runnable (BST_PEND).
+ */
+static int
+blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
+    enum blockop op)
+{
+	struct blockif_elem *elem, *other;
+	off_t end;
+	int idx;
+
+	elem = TAILQ_FIRST(&bc->bc_freeq);
+	assert(elem != NULL);
+	assert(elem->be_status == BST_FREE);
+	TAILQ_REMOVE(&bc->bc_freeq, elem, be_link);
+	elem->be_req = breq;
+	elem->be_op = op;
+
+	/* Data-carrying operations end at offset + total iovec length. */
+	switch (op) {
+	case BOP_READ:
+	case BOP_WRITE:
+#ifndef __FreeBSD__
+	case BOP_WRITE_SYNC:
+#endif
+	case BOP_DELETE:
+		end = breq->br_offset;
+		idx = 0;
+		while (idx < breq->br_iovcnt) {
+			end += breq->br_iov[idx].iov_len;
+			idx++;
+		}
+		break;
+	default:
+		end = OFF_MAX;
+	}
+	elem->be_block = end;
+
+	/* Look for a predecessor: first among pending, then among busy. */
+	for (other = TAILQ_FIRST(&bc->bc_pendq); other != NULL;
+	    other = TAILQ_NEXT(other, be_link)) {
+		if (other->be_block == breq->br_offset)
+			break;
+	}
+	if (other == NULL) {
+		for (other = TAILQ_FIRST(&bc->bc_busyq); other != NULL;
+		    other = TAILQ_NEXT(other, be_link)) {
+			if (other->be_block == breq->br_offset)
+				break;
+		}
+	}
+
+	elem->be_status = (other == NULL) ? BST_PEND : BST_BLOCK;
+	TAILQ_INSERT_TAIL(&bc->bc_pendq, elem, be_link);
+
+	return (elem->be_status == BST_PEND);
+}
+
+/*
+ * Move the first runnable (BST_PEND) request from the pending queue to the
+ * busy queue on behalf of thread t and hand it back through *bep.
+ *
+ * Returns 1 when a request was dequeued, 0 when nothing is runnable (in
+ * which case *bep is left untouched).
+ */
+static int
+blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
+{
+	struct blockif_elem *cand;
+
+	/* Skip over blocked entries; anything else on this queue is a bug. */
+	cand = TAILQ_FIRST(&bc->bc_pendq);
+	while (cand != NULL && cand->be_status != BST_PEND) {
+		assert(cand->be_status == BST_BLOCK);
+		cand = TAILQ_NEXT(cand, be_link);
+	}
+	if (cand == NULL)
+		return (0);
+
+	TAILQ_REMOVE(&bc->bc_pendq, cand, be_link);
+	cand->be_status = BST_BUSY;
+	cand->be_tid = t;
+	TAILQ_INSERT_TAIL(&bc->bc_busyq, cand, be_link);
+	*bep = cand;
+
+	return (1);
+}
+
+/*
+ * Retire a request element: unlink it from the queue it currently sits on,
+ * release every pending request that starts where this one ended, and put
+ * the element back on the free queue.
+ */
+static void
+blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+	struct blockif_elem *waiter;
+
+	/* Busy/done elements are on the busy queue, all others on pending. */
+	switch (be->be_status) {
+	case BST_DONE:
+	case BST_BUSY:
+		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
+		break;
+	default:
+		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+		break;
+	}
+
+	for (waiter = TAILQ_FIRST(&bc->bc_pendq); waiter != NULL;
+	    waiter = TAILQ_NEXT(waiter, be_link)) {
+		if (waiter->be_req->br_offset == be->be_block)
+			waiter->be_status = BST_PEND;
+	}
+
+	be->be_tid = 0;
+	be->be_status = BST_FREE;
+	be->be_req = NULL;
+	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+}
+
+/*
+ * Execute one dequeued request against the backing descriptor and report
+ * the outcome through the request's callback.
+ *
+ * bc  - block context (descriptor and capability flags)
+ * be  - queue element naming the request and the operation
+ * buf - optional bounce buffer; it is ignored when the request has at most
+ *       one iovec
+ *
+ * With no bounce buffer, reads and writes go straight through
+ * preadv()/pwritev().  With one, data is moved in chunks of at most MAXPHYS
+ * bytes between the buffer and the request's iovecs.  The element is marked
+ * BST_DONE and the callback is invoked with 0 or an errno value.
+ */
+static void
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
+{
+ struct blockif_req *br;
+#ifdef __FreeBSD__
+ off_t arg[2];
+#endif
+ ssize_t clen, len, off, boff, voff;
+ int i, err;
+
+ br = be->be_req;
+ if (br->br_iovcnt <= 1)
+ buf = NULL;
+ err = 0;
+ switch (be->be_op) {
+ case BOP_READ:
+ if (buf == NULL) {
+ if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= len;
+ break;
+ }
+ /*
+ * Bounce-buffer read: i indexes the current iovec, voff is the
+ * position inside it, off is the position inside the request.
+ * NOTE(review): only a negative pread() result is treated as an
+ * error; a short read is accounted as a full chunk.
+ */
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ if (pread(bc->bc_fd, buf, len, br->br_offset +
+ off) < 0) {
+ err = errno;
+ break;
+ }
+ boff = 0;
+ /* Scatter the chunk just read across the iovecs. */
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy(br->br_iov[i].iov_base + voff,
+ buf + boff, clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+ off += len;
+ br->br_resid -= len;
+ }
+ break;
+ case BOP_WRITE:
+ if (bc->bc_rdonly) {
+ err = EROFS;
+ break;
+ }
+ if (buf == NULL) {
+ if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= len;
+ break;
+ }
+ /*
+ * Bounce-buffer write: gather up to MAXPHYS bytes from the
+ * iovecs into buf, then write the chunk out.
+ * NOTE(review): as for reads, a short pwrite() is not detected.
+ */
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy(buf + boff,
+ br->br_iov[i].iov_base + voff, clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+ if (pwrite(bc->bc_fd, buf, len, br->br_offset +
+ off) < 0) {
+ err = errno;
+ break;
+ }
+ off += len;
+ br->br_resid -= len;
+ }
+ break;
+ case BOP_FLUSH:
+#ifdef __FreeBSD__
+ if (bc->bc_ischr) {
+ if (ioctl(bc->bc_fd, DIOCGFLUSH))
+ err = errno;
+ } else if (fsync(bc->bc_fd))
+ err = errno;
+#else
+ /*
+ * This fsync() should be adequate to flush the cache of a file
+ * or device. In VFS, the VOP_SYNC operation is converted to
+ * the appropriate ioctl in both sdev (for real devices) and
+ * zfs (for zvols).
+ */
+ if (fsync(bc->bc_fd))
+ err = errno;
+#endif
+ break;
+ case BOP_DELETE:
+ /* Only the FreeBSD character-device path can actually delete. */
+ if (!bc->bc_candelete)
+ err = EOPNOTSUPP;
+ else if (bc->bc_rdonly)
+ err = EROFS;
+#ifdef __FreeBSD__
+ else if (bc->bc_ischr) {
+ arg[0] = br->br_offset;
+ arg[1] = br->br_resid;
+ if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
+ err = errno;
+ else
+ br->br_resid = 0;
+ }
+#endif
+ else
+ err = EOPNOTSUPP;
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ /* Mark the element finished before the callback runs. */
+ be->be_status = BST_DONE;
+
+ (*br->br_callback)(br, err);
+}
+
+/*
+ * Body of a block I/O worker thread; arg is the owning blockif_ctxt.
+ *
+ * The thread repeatedly drains runnable requests, processing each one with
+ * the context mutex dropped, and sleeps on the context's condition variable
+ * when there is nothing to do.  It exits once bc_closing is set and no
+ * runnable request is left.  A bounce buffer of MAXPHYS bytes is allocated
+ * only when bc_isgeom is set.
+ */
+static void *
+blockif_thr(void *arg)
+{
+	struct blockif_ctxt *bc = arg;
+	struct blockif_elem *work;
+	uint8_t *bounce = NULL;
+	pthread_t self;
+
+	if (bc->bc_isgeom)
+		bounce = malloc(MAXPHYS);
+	self = pthread_self();
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	while (1) {
+		while (blockif_dequeue(bc, self, &work)) {
+			/* Do the I/O without holding the queue lock. */
+			pthread_mutex_unlock(&bc->bc_mtx);
+			blockif_proc(bc, work, bounce);
+			pthread_mutex_lock(&bc->bc_mtx);
+			blockif_complete(bc, work);
+		}
+		/* Check ctxt status here to see if exit requested */
+		if (bc->bc_closing)
+			break;
+		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+	}
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	/* free(NULL) is a no-op, so no guard is needed. */
+	free(bounce);
+	pthread_exit(NULL);
+	return (NULL);
+}
+
+/*
+ * SIGCONT handler used to complete the cancel handshake.  It pops every
+ * element off the global blockif_bse_head list with a compare-and-swap
+ * and, for each one, clears bse_pending and signals bse_cond so that the
+ * waiter in blockif_cancel() can continue.  It returns once the list is
+ * empty.  The FreeBSD variant has the mevent callback signature; the other
+ * variant is a plain signal handler.
+ */
+#ifdef __FreeBSD__
+static void
+blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
+#else
+static void
+blockif_sigcont_handler(int signal)
+#endif
+{
+ struct blockif_sig_elem *bse;
+
+ for (;;) {
+ /*
+ * Process the entire list even if not intended for
+ * this thread.
+ */
+ do {
+ bse = blockif_bse_head;
+ if (bse == NULL)
+ return;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)bse,
+ (uintptr_t)bse->bse_next));
+
+ /* Wake the canceller that queued this element. */
+ pthread_mutex_lock(&bse->bse_mtx);
+ bse->bse_pending = 0;
+ pthread_cond_signal(&bse->bse_cond);
+ pthread_mutex_unlock(&bse->bse_mtx);
+ }
+}
+
+/*
+ * One-time module setup, run through pthread_once() from blockif_open().
+ * On FreeBSD the SIGCONT handler is registered as an mevent signal event
+ * and the signal disposition is set to SIG_IGN; elsewhere the handler is
+ * installed directly with sigset().
+ */
+static void
+blockif_init(void)
+{
+#ifdef __FreeBSD__
+ mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+ (void) signal(SIGCONT, SIG_IGN);
+#else
+ (void) sigset(SIGCONT, blockif_sigcont_handler);
+#endif
+}
+
+/*
+ * Open a block backend and start its worker threads.
+ *
+ * optstr - comma separated string: the backing file or device path first,
+ *          followed by optional "nocache", "sync"/"direct", "ro" and
+ *          "sectorsize=<logical>[/<physical>]" elements
+ * ident  - label used in the worker thread names ("blk-<ident>-<n>")
+ *
+ * If a read/write open fails, a read-only open is attempted and the
+ * context is marked read-only.
+ *
+ * Returns the new context, or NULL after printing a diagnostic when the
+ * options are invalid, the backing object cannot be opened or inspected,
+ * or memory cannot be allocated.  Sandbox setup failures are fatal.
+ */
+struct blockif_ctxt *
+blockif_open(const char *optstr, const char *ident)
+{
+	char tname[MAXCOMLEN + 1];
+#ifdef __FreeBSD__
+	char name[MAXPATHLEN];
+#endif
+	char *nopt, *xopts, *cp;
+	struct blockif_ctxt *bc;
+	struct stat sbuf;
+#ifdef __FreeBSD__
+	struct diocgattr_arg arg;
+#else
+	enum blockif_wce wce = WCE_NONE;
+#endif
+	off_t size, psectsz, psectoff;
+	int extra, fd, i, sectsz;
+	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
+#endif
+
+	pthread_once(&blockif_once, blockif_init);
+
+	fd = -1;
+	ssopt = 0;
+	nocache = 0;
+	sync = 0;
+	ro = 0;
+
+	/*
+	 * The first element in the optstring is always a pathname.
+	 * Optional elements follow
+	 */
+	nopt = xopts = strdup(optstr);
+	if (nopt == NULL) {
+		perror("strdup");
+		goto err;
+	}
+	while (xopts != NULL) {
+		cp = strsep(&xopts, ",");
+		if (cp == nopt)		/* file or device pathname */
+			continue;
+		else if (!strcmp(cp, "nocache"))
+			nocache = 1;
+		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
+			sync = 1;
+		else if (!strcmp(cp, "ro"))
+			ro = 1;
+		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
+			;
+		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
+			pssopt = ssopt;
+		else {
+			fprintf(stderr, "Invalid device option \"%s\"\n", cp);
+			goto err;
+		}
+	}
+
+	extra = 0;
+	if (nocache)
+		extra |= O_DIRECT;
+	if (sync)
+		extra |= O_SYNC;
+
+	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
+	if (fd < 0 && !ro) {
+		/* Retry a failed read/write open as read-only */
+		fd = open(nopt, O_RDONLY | extra);
+		ro = 1;
+	}
+
+	if (fd < 0) {
+		warn("Could not open backing file: %s", nopt);
+		goto err;
+	}
+
+	if (fstat(fd, &sbuf) < 0) {
+		warn("Could not stat backing file %s", nopt);
+		goto err;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
+	    CAP_WRITE);
+	if (ro)
+		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
+
+	if (caph_rights_limit(fd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	/*
+	 * Deal with raw devices
+	 */
+	size = sbuf.st_size;
+	sectsz = DEV_BSIZE;
+	psectsz = psectoff = 0;
+	candelete = geom = 0;
+#ifdef __FreeBSD__
+	if (S_ISCHR(sbuf.st_mode)) {
+		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+			perror("Could not fetch dev blk/sector size");
+			goto err;
+		}
+		assert(size != 0);
+		assert(sectsz != 0);
+		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
+			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
+		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+		arg.len = sizeof(arg.value.i);
+		if (ioctl(fd, DIOCGATTR, &arg) == 0)
+			candelete = arg.value.i;
+		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+			geom = 1;
+	} else {
+		psectsz = sbuf.st_blksize;
+	}
+#else
+	psectsz = sbuf.st_blksize;
+	if (S_ISCHR(sbuf.st_mode)) {
+		struct dk_minfo_ext dkmext;
+		int wce_val;
+
+		/* Look for a more accurate physical blocksize */
+		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
+			psectsz = dkmext.dki_pbsize;
+		}
+		/* See if a configurable write cache is present and working */
+		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
+			/*
+			 * If WCE is already active, disable it until the
+			 * specific device driver calls for its return. If it
+			 * is not active, toggle it on and off to verify that
+			 * such actions are possible.
+			 */
+			if (wce_val != 0) {
+				int r;
+
+				wce_val = 0;
+				/*
+				 * Inability to disable the cache is a threat
+				 * to data durability.  The ioctl is issued
+				 * outside assert() so that it still happens
+				 * when assertions are compiled out.
+				 */
+				r = ioctl(fd, DKIOCSETWCE, &wce_val);
+				assert(r == 0);
+				wce = WCE_IOCTL;
+			} else {
+				int r1, r2;
+
+				wce_val = 1;
+				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
+				wce_val = 0;
+				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);
+
+				if (r1 == 0 && r2 == 0) {
+					wce = WCE_IOCTL;
+				} else {
+					/*
+					 * If the cache toggle was not
+					 * successful, ensure that the cache
+					 * was not left enabled.
+					 */
+					assert(r1 != 0);
+				}
+			}
+		}
+	} else {
+		int flags;
+
+		/* Regular files: start with synchronous data writes. */
+		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
+			flags |= O_DSYNC;
+			if (fcntl(fd, F_SETFL, flags) != -1) {
+				wce = WCE_FCNTL;
+			}
+		}
+	}
+#endif
+
+#ifndef WITHOUT_CAPSICUM
+	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	if (ssopt != 0) {
+		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+		    ssopt > pssopt) {
+			fprintf(stderr, "Invalid sector size %d/%d\n",
+			    ssopt, pssopt);
+			goto err;
+		}
+
+		/*
+		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
+		 * size be a multiple of the device's sector size.
+		 *
+		 * Validate that the emulated sector size complies with this
+		 * requirement.
+		 */
+		if (S_ISCHR(sbuf.st_mode)) {
+			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+				fprintf(stderr, "Sector size %d incompatible "
+				    "with underlying device sector size %d\n",
+				    ssopt, sectsz);
+				goto err;
+			}
+		}
+
+		sectsz = ssopt;
+		psectsz = pssopt;
+		psectoff = 0;
+	}
+
+	bc = calloc(1, sizeof(struct blockif_ctxt));
+	if (bc == NULL) {
+		perror("calloc");
+		goto err;
+	}
+
+	bc->bc_magic = BLOCKIF_SIG;
+	bc->bc_fd = fd;
+	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+	bc->bc_isgeom = geom;
+	bc->bc_candelete = candelete;
+#ifndef __FreeBSD__
+	bc->bc_wce = wce;
+#endif
+	bc->bc_rdonly = ro;
+	bc->bc_size = size;
+	bc->bc_sectsz = sectsz;
+	bc->bc_psectsz = psectsz;
+	bc->bc_psectoff = psectoff;
+	pthread_mutex_init(&bc->bc_mtx, NULL);
+	pthread_cond_init(&bc->bc_cond, NULL);
+	TAILQ_INIT(&bc->bc_freeq);
+	TAILQ_INIT(&bc->bc_pendq);
+	TAILQ_INIT(&bc->bc_busyq);
+	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+		bc->bc_reqs[i].be_status = BST_FREE;
+		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+	}
+
+	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
+		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
+		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
+		pthread_set_name_np(bc->bc_btid[i], tname);
+	}
+
+	/* The option string copy is not referenced by the context. */
+	free(nopt);
+	return (bc);
+err:
+	if (fd >= 0)
+		close(fd);
+	/*
+	 * cp and xopts only ever point into the buffer owned by nopt, so
+	 * that buffer is the one thing to release here.
+	 */
+	free(nopt);
+	return (NULL);
+}
+
+/*
+ * Common submission path for all operations.  Under the context mutex the
+ * request is queued if a free element exists, and a worker is woken when
+ * the request is immediately runnable.
+ *
+ * Returns 0 on success or E2BIG when no free element is left.
+ */
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+    enum blockop op)
+{
+	int err = 0;
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	if (TAILQ_EMPTY(&bc->bc_freeq)) {
+		/*
+		 * Callers are not allowed to enqueue more than
+		 * the specified blockif queue limit. Return an
+		 * error to indicate that the queue length has been
+		 * exceeded.
+		 */
+		err = E2BIG;
+	} else if (blockif_enqueue(bc, breq, op)) {
+		/*
+		 * Enqueue and inform the block i/o thread
+		 * that there is work available
+		 */
+		pthread_cond_signal(&bc->bc_cond);
+	}
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	return (err);
+}
+
+/*
+ * Queue an asynchronous read described by breq.  Returns 0 once queued or
+ * E2BIG when the request queue is full.
+ */
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_READ));
+}
+
+/*
+ * Queue an asynchronous write described by breq.  Returns 0 once queued or
+ * E2BIG when the request queue is full.
+ */
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+/*
+ * Queue an asynchronous flush of the backend.  Returns 0 once queued or
+ * E2BIG when the request queue is full.
+ */
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+/*
+ * Queue an asynchronous delete of the range described by breq.  Returns 0
+ * once queued or E2BIG when the request queue is full.
+ */
+int
+blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_DELETE));
+}
+
+/*
+ * Try to cancel a previously submitted request.
+ *
+ * Returns 0 when the request was still on the pending queue and has been
+ * removed (its callback is not invoked on this path), EINVAL when the
+ * request is neither pending nor in flight, and EBUSY when it was in
+ * flight: in that case the processing thread is interrupted with SIGCONT
+ * and the callback may or may not already have run.
+ *
+ * The context mutex is held for the whole call, including the wait for the
+ * signal handler.
+ */
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ /*
+ * Check pending requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be != NULL) {
+ /*
+ * Found it.
+ */
+ blockif_complete(bc, be);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
+ }
+
+ /*
+ * Check in-flight requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ /*
+ * Didn't find it.
+ */
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ /*
+ * Interrupt the processing thread to force it to return
+ * prematurely via its normal callback path.
+ */
+ while (be->be_status == BST_BUSY) {
+ struct blockif_sig_elem bse, *old_head;
+
+ pthread_mutex_init(&bse.bse_mtx, NULL);
+ pthread_cond_init(&bse.bse_cond, NULL);
+
+ bse.bse_pending = 1;
+
+ /* Push the stack-allocated element onto the global list. */
+ do {
+ old_head = blockif_bse_head;
+ bse.bse_next = old_head;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)old_head,
+ (uintptr_t)&bse));
+
+ pthread_kill(be->be_tid, SIGCONT);
+
+ /* Wait until the SIGCONT handler has seen this element. */
+ pthread_mutex_lock(&bse.bse_mtx);
+ while (bse.bse_pending)
+ pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+ pthread_mutex_unlock(&bse.bse_mtx);
+ }
+
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * The processing thread has been interrupted. Since it's not
+ * clear if the callback has been invoked yet, return EBUSY.
+ */
+ return (EBUSY);
+}
+
+/*
+ * Shut down a block backend: ask the worker threads to exit, wait for all
+ * of them, then close the descriptor and free the context.  The context
+ * must not be used afterwards.  Always returns 0.
+ */
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+ void *jval;
+ int i;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ /*
+ * Stop the block i/o thread
+ */
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_closing = 1;
+ pthread_mutex_unlock(&bc->bc_mtx);
+ pthread_cond_broadcast(&bc->bc_cond);
+ for (i = 0; i < BLOCKIF_NUMTHR; i++)
+ pthread_join(bc->bc_btid[i], &jval);
+
+ /* XXX Cancel queued i/o's ??? */
+
+ /*
+ * Release resources
+ */
+ /* Clear the signature so that stale users trip the magic asserts. */
+ bc->bc_magic = 0;
+ close(bc->bc_fd);
+ free(bc);
+
+ return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ *
+ * The sector count is the backend size divided by the emulated sector size,
+ * clamped to 65535 * 16 * 255.  The results are stored through c
+ * (cylinders), h (heads) and s (sectors per track).
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535UL*16*255)
+ sectors = 65535UL*16*255;
+
+ if (sectors >= 65536UL*16*63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ /* Try 17, then 31, then 63 sectors per track. */
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
+
+/*
+ * Accessors
+ */
+/* Return the backend size recorded when the backend was opened. */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_size);
+}
+
+/* Return the sector size recorded in the context. */
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_sectsz);
+}
+
+/*
+ * Store the physical sector size in *size and the physical sector offset
+ * in *off.  Both pointers must be valid.
+ */
+void
+blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ *size = bc->bc_psectsz;
+ *off = bc->bc_psectoff;
+}
+
+/*
+ * Return the queue depth reported to callers, which is one less than the
+ * number of request elements (BLOCKIF_MAXREQ).
+ */
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (BLOCKIF_MAXREQ - 1);
+}
+
+/* Return non-zero when the backend was opened read-only. */
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_rdonly);
+}
+
+/* Return the delete capability recorded when the backend was opened. */
+int
+blockif_candelete(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_candelete);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Enable (wc_enable != 0) or disable the backend's write cache using the
+ * mechanism detected at open time: the DKIOCSETWCE ioctl, or the O_DSYNC
+ * descriptor flag (set to disable caching, cleared to enable it).  With no
+ * mechanism available nothing is changed.
+ *
+ * After a successful disable the descriptor is fsync()ed.  Returns 0 on
+ * success, otherwise the failing call's result (-1 for fcntl failures).
+ */
+int
+blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
+{
+	int rv = 0;
+	int fl;
+	int wce_arg = (wc_enable == 0) ? 0 : 1;
+
+	(void) pthread_mutex_lock(&bc->bc_mtx);
+	if (bc->bc_wce == WCE_IOCTL) {
+		rv = ioctl(bc->bc_fd, DKIOCSETWCE, &wce_arg);
+	} else if (bc->bc_wce == WCE_FCNTL) {
+		fl = fcntl(bc->bc_fd, F_GETFL);
+		if (fl < 0) {
+			rv = -1;
+		} else {
+			if (wc_enable != 0)
+				fl &= ~O_DSYNC;
+			else
+				fl |= O_DSYNC;
+			if (fcntl(bc->bc_fd, F_SETFL, fl) == -1)
+				rv = -1;
+		}
+	}
+
+	/*
+	 * After a successful disable of the write cache, ensure that any
+	 * lingering data in the cache is synced out.
+	 */
+	if (rv == 0 && wc_enable == 0)
+		rv = fsync(bc->bc_fd);
+	(void) pthread_mutex_unlock(&bc->bc_mtx);
+
+	return (rv);
+}
+#endif /* __FreeBSD__ */
diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h
new file mode 100644
index 0000000000..8401cd9529
--- /dev/null
+++ b/usr/src/cmd/bhyve/block_if.h
@@ -0,0 +1,84 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The block API to be used by bhyve block-device emulations. The routines
+ * are thread safe, with no assumptions about the context of the completion
+ * callback - it may occur in the caller's context, or asynchronously in
+ * another thread.
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+#ifdef __FreeBSD__
+#define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */
+#else
+/*
+ * Upstream is in the process of bumping this up to 128 for several reasons,
+ * including Windows compatibility. For the sake of our Windows support, we
+ * will use the higher value now.
+ */
+#define BLOCKIF_IOV_MAX 128
+#endif
+
+struct blockif_req {
+ int br_iovcnt; /* presumably the number of br_iov entries in use -- confirm */
+ off_t br_offset;
+ ssize_t br_resid;
+ void (*br_callback)(struct blockif_req *req, int err); /* completion callback */
+ void *br_param; /* NOTE(review): looks like caller context for br_callback */
+ struct iovec br_iov[BLOCKIF_IOV_MAX]; /* capacity is BLOCKIF_IOV_MAX segments */
+};
+
+struct blockif_ctxt;
+struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
+off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
+int blockif_sectsz(struct blockif_ctxt *bc);
+void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
+int blockif_queuesz(struct blockif_ctxt *bc);
+int blockif_is_ro(struct blockif_ctxt *bc);
+int blockif_candelete(struct blockif_ctxt *bc);
+#ifndef __FreeBSD__
+int blockif_set_wce(struct blockif_ctxt *bc, int enable);
+#endif
+int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c
new file mode 100644
index 0000000000..b8c63828c8
--- /dev/null
+++ b/usr/src/cmd/bhyve/bootrom.c
@@ -0,0 +1,113 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <machine/vmm.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include <vmmapi.h>
+#include "bhyverun.h"
+#include "bootrom.h"
+
+#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */
+
+int
+bootrom_init(struct vmctx *ctx, const char *romfile)
+{
+ struct stat sbuf;
+ vm_paddr_t gpa;
+ ssize_t rlen;
+ char *ptr;
+ int fd, i, rv, prot;
+ /* Copy 'romfile' into guest memory ending at 4GB; returns 0 or -1. */
+ rv = -1; /* assume failure until the whole image has been copied */
+ fd = open(romfile, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "Error opening bootrom \"%s\": %s\n",
+ romfile, strerror(errno));
+ goto done;
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n",
+ romfile, strerror(errno));
+ goto done;
+ }
+
+ /*
+ * Limit bootrom size to 16MB so it doesn't encroach into reserved
+ * MMIO space (e.g. APIC, HPET, MSI).
+ */
+ if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) {
+ fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size);
+ goto done;
+ }
+
+ if (sbuf.st_size & PAGE_MASK) {
+ fprintf(stderr, "Bootrom size %ld is not a multiple of the "
+ "page size\n", sbuf.st_size);
+ goto done;
+ }
+ /* Create a VM_BOOTROM devmem segment of exactly the file size. */
+ ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size);
+ if (ptr == MAP_FAILED)
+ goto done;
+
+ /* Map the bootrom into the guest address space */
+ prot = PROT_READ | PROT_EXEC;
+ gpa = (1ULL << 32) - sbuf.st_size; /* the image ends exactly at 4GB */
+ if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0)
+ goto done;
+
+ /* Read 'romfile' into the guest address space */
+ for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) {
+ rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE);
+ if (rlen != PAGE_SIZE) { /* a short read is treated as an error */
+ fprintf(stderr, "Incomplete read of page %d of bootrom "
+ "file %s: %ld bytes\n", i, romfile, rlen);
+ goto done;
+ }
+ }
+ rv = 0;
+done:
+ if (fd >= 0) /* fd is negative only when open() itself failed */
+ close(fd);
+ return (rv);
+}
diff --git a/usr/src/cmd/bhyve/bootrom.h b/usr/src/cmd/bhyve/bootrom.h
new file mode 100644
index 0000000000..7fb12181dd
--- /dev/null
+++ b/usr/src/cmd/bhyve/bootrom.h
@@ -0,0 +1,40 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BOOTROM_H_
+#define _BOOTROM_H_
+
+#include <stdbool.h>
+
+struct vmctx;
+
+int bootrom_init(struct vmctx *ctx, const char *romfile);
+
+#endif
diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c
new file mode 100644
index 0000000000..2567f69959
--- /dev/null
+++ b/usr/src/cmd/bhyve/console.c
@@ -0,0 +1,120 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "bhyvegc.h"
+#include "console.h"
+
+static struct {
+ struct bhyvegc *gc;
+ /* Framebuffer renderer invoked by console_refresh(). */
+ fb_render_func_t fb_render_cb;
+ void *fb_arg;
+ /* Keyboard handler; replaced only by a strictly higher priority. */
+ kbd_event_func_t kbd_event_cb;
+ void *kbd_arg;
+ int kbd_priority;
+ /* Pointer handler; replaced only by a strictly higher priority. */
+ ptr_event_func_t ptr_event_cb;
+ void *ptr_arg;
+ int ptr_priority;
+} console;
+
+void
+console_init(int w, int h, void *fbaddr)
+{
+ console.gc = bhyvegc_init(w, h, fbaddr); /* keep the handle for later calls */
+}
+
+void
+console_set_fbaddr(void *fbaddr)
+{
+ bhyvegc_set_fbaddr(console.gc, fbaddr); /* forwards to the stored context */
+}
+
+struct bhyvegc_image *
+console_get_image(void)
+{
+ struct bhyvegc_image *bhyvegc_image;
+ /* Return whatever bhyvegc_get_image() reports for the stored context. */
+ bhyvegc_image = bhyvegc_get_image(console.gc);
+
+ return (bhyvegc_image);
+}
+
+void
+console_fb_register(fb_render_func_t render_cb, void *arg)
+{
+ console.fb_render_cb = render_cb; /* unconditionally replaces any earlier one */
+ console.fb_arg = arg;
+}
+
+void
+console_refresh(void)
+{
+ if (console.fb_render_cb) /* a no-op until a renderer is registered */
+ (*console.fb_render_cb)(console.gc, console.fb_arg);
+}
+
+void
+console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri)
+{
+ if (pri > console.kbd_priority) { /* strictly greater: ties keep the current one */
+ console.kbd_event_cb = event_cb;
+ console.kbd_arg = arg;
+ console.kbd_priority = pri;
+ }
+}
+
+void
+console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri)
+{
+ if (pri > console.ptr_priority) { /* strictly greater: ties keep the current one */
+ console.ptr_event_cb = event_cb;
+ console.ptr_arg = arg;
+ console.ptr_priority = pri;
+ }
+}
+
+void
+console_key_event(int down, uint32_t keysym)
+{
+ if (console.kbd_event_cb) /* the event is dropped when nothing is registered */
+ (*console.kbd_event_cb)(down, keysym, console.kbd_arg);
+}
+
+void
+console_ptr_event(uint8_t button, int x, int y)
+{
+ if (console.ptr_event_cb) /* the event is dropped when nothing is registered */
+ (*console.ptr_event_cb)(button, x, y, console.ptr_arg);
+}
diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h
new file mode 100644
index 0000000000..0d0a854866
--- /dev/null
+++ b/usr/src/cmd/bhyve/console.h
@@ -0,0 +1,55 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CONSOLE_H_
+#define _CONSOLE_H_
+
+struct bhyvegc;
+
+typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg);
+typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg);
+typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg);
+
+void console_init(int w, int h, void *fbaddr);
+
+void console_set_fbaddr(void *fbaddr);
+
+struct bhyvegc_image *console_get_image(void);
+
+void console_fb_register(fb_render_func_t render_cb, void *arg);
+void console_refresh(void);
+
+void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri);
+void console_key_event(int down, uint32_t keysym);
+
+void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri);
+void console_ptr_event(uint8_t button, int x, int y);
+
+#endif /* _CONSOLE_H_ */
diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c
new file mode 100644
index 0000000000..cda2df2414
--- /dev/null
+++ b/usr/src/cmd/bhyve/consport.c
@@ -0,0 +1,180 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/select.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <sysexits.h>
+
+#include "inout.h"
+#include "pci_lpc.h"
+
+#define BVM_CONSOLE_PORT 0x220
+#define BVM_CONS_SIG ('b' << 8 | 'v')
+
+#ifdef __FreeBSD__
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); /* put back what ttyopen() saved */
+}
+#endif
+
+static void
+ttyopen(void)
+{
+#ifdef __FreeBSD__
+ /* Save the current stdin settings, then switch stdin to raw mode. */
+ tcgetattr(STDIN_FILENO, &tio_orig);
+ tio_new = tio_orig; /* cfmakeraw() only edits flags; start from live settings */
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+ atexit(ttyclose); /* restore the saved settings when the process exits */
+#endif
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+ /* Zero timeout: ask select() whether stdin is readable without blocking. */
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else { /* timeout (0) and select() failure (-1) both report "nothing" */
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+ /* Return the next stdin byte (0-255), or -1 if none could be read. */
+ if (tty_char_available() && read(STDIN_FILENO, &rb, 1) == 1) {
+ return (rb & 0xff);
+ } else {
+ /* Nothing pending, EOF, or read error: rb was never filled in. */
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1); /* the result is deliberately ignored */
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+ cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ };
+#endif
+ /* A 2-byte read returns the BVM_CONS_SIG signature. */
+ if (bytes == 2 && in) {
+ *eax = BVM_CONS_SIG;
+ return (0);
+ }
+
+ /*
+ * Guests might probe this port to look for old ISA devices
+ * using single-byte reads. Return 0xff for those.
+ */
+ if (bytes == 1 && in) {
+ *eax = 0xff;
+ return (0);
+ }
+ /* Every remaining access must be 4 bytes wide. */
+ if (bytes != 4)
+ return (-1);
+ /* Terminal setup is deferred until the first 4-byte access. */
+ if (!opened) {
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ,
+ CAP_WRITE);
+ if (caph_rights_limit(STDIN_FILENO, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ if (caph_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+ ttyopen();
+ opened = 1;
+ }
+ /* IN yields a pending byte or -1; OUT writes the low byte of *eax. */
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+
+SYSRES_IO(BVM_CONSOLE_PORT, 4);
+
+static struct inout_port consport = {
+ "bvmcons",
+ BVM_CONSOLE_PORT,
+ 1,
+ IOPORT_F_INOUT,
+ console_handler
+};
+
+void
+init_bvmcons(void)
+{
+ /* Register the consport descriptor, whose handler is console_handler(). */
+ register_inout(&consport);
+}
diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c
new file mode 100644
index 0000000000..88a616b50d
--- /dev/null
+++ b/usr/src/cmd/bhyve/dbgport.c
@@ -0,0 +1,180 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/uio.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sysexits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+#include "dbgport.h"
+#include "pci_lpc.h"
+
+#define BVM_DBG_PORT 0x224
+#define BVM_DBG_SIG ('B' << 8 | 'V')
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int nwritten, nread, printonce;
+ int on = 1;
+ char ch;
+ /* A 2-byte read returns the BVM_DBG_SIG signature. */
+ if (bytes == 2 && in) {
+ *eax = BVM_DBG_SIG;
+ return (0);
+ }
+ /* All data transfers must be 4 bytes wide. */
+ if (bytes != 4)
+ return (-1);
+ /* Block until a debugger connection exists; re-entered after a drop. */
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK);
+ if (conn_fd >= 0) {
+ /* Avoid SIGPIPE after the client drops off. */
+ (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE,
+ &on, sizeof(on));
+ /* Improve latency for one byte at a time transfers. */
+ (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY,
+ &on, sizeof(on));
+ } else if (errno != EINTR) {
+ perror("accept");
+ }
+ }
+ /* IN: one byte, or -1 when nothing is pending. OUT: send the low byte. */
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = (unsigned char)ch; /* no sign extension: 0xff must not read as -1 */
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+static struct inout_port dbgport = {
+ "bvmdbg",
+ BVM_DBG_PORT,
+ 1,
+ IOPORT_F_INOUT,
+ dbg_handler
+};
+
+SYSRES_IO(BVM_DBG_PORT, 4);
+
+void
+init_dbgport(int sport)
+{
+ int reuse;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+ /* Listen on TCP port 'sport' (INADDR_ANY); any setup failure exits with 4. */
+ conn_fd = -1; /* no debugger connected yet; dbg_handler() accepts on demand */
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("cannot create socket");
+ exit(4);
+ }
+
+#ifdef __FreeBSD__
+ sin.sin_len = sizeof(sin);
+#endif
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ reuse = 1;
+ if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse,
+ sizeof(reuse)) < 0) {
+ perror("cannot set socket options");
+ exit(4);
+ }
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("cannot bind socket");
+ exit(4);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("cannot listen socket");
+ exit(4);
+ }
+ /* Limit the listening socket to accept/read/write rights. */
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(listen_fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+ /* Register the dbgport descriptor, whose handler is dbg_handler(). */
+ register_inout(&dbgport);
+}
diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h
new file mode 100644
index 0000000000..407ff3ffbf
--- /dev/null
+++ b/usr/src/cmd/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c
new file mode 100644
index 0000000000..0640bc28ba
--- /dev/null
+++ b/usr/src/cmd/bhyve/fwctl.c
@@ -0,0 +1,552 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does,
+ * but with a request/response messaging protocol.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "fwctl.h"
+
+/*
+ * Messaging protocol base operations
+ */
+#define OP_NULL 1
+#define OP_ECHO 2
+#define OP_GET 3
+#define OP_GET_LEN 4
+#define OP_SET 5
+#define OP_MAX OP_SET
+
+/* I/O ports */
+#define FWCTL_OUT 0x510
+#define FWCTL_IN 0x511
+
+/*
+ * Back-end state-machine
+ */
+enum state {
+ DORMANT, /* initial value, until fwctl_init() runs */
+ IDENT_WAIT, /* waiting for a 16-bit write of 0 */
+ IDENT_SEND, /* byte reads return the sig[] bytes in order */
+ REQ, /* 32-bit writes are consumed as request words */
+ RESP /* 32-bit reads return response words */
+} be_state = DORMANT;
+
+static uint8_t sig[] = { 'B', 'H', 'Y', 'V' };
+static u_int ident_idx;
+
+struct op_info {
+ int op; /* NOTE(review): not assigned by the op tables in this file */
+ int (*op_start)(uint32_t len); /* payload length; non-zero return is an error */
+ void (*op_data)(uint32_t data, uint32_t len); /* one word; len = bytes remaining */
+ int (*op_result)(struct iovec **data); /* yields response vector and error code */
+ void (*op_done)(struct iovec *data); /* called after the response was sent */
+};
+static struct op_info *ops[OP_MAX+1];
+
+/* Return 0-padded uint32_t */
+static uint32_t
+fwctl_send_rest(uint32_t *data, size_t len)
+{
+ union {
+ uint8_t c[4];
+ uint32_t w;
+ } u;
+ uint8_t *cdata;
+ int i;
+ /* Pack the first 'len' bytes at 'data' into a word whose rest is zero. */
+ cdata = (uint8_t *) data;
+ u.w = 0;
+ /* NOTE(review): len is not bounded here; u.c holds 4 bytes, so len must be <= 4. */
+ for (i = 0, u.w = 0; i < len; i++)
+ u.c[i] = *cdata++;
+
+ return (u.w);
+}
+
+/*
+ * error op dummy proto - drop all data sent and return an error
+*/
+static int errop_code;
+
+static void
+errop_set(int err)
+{
+ /* Record the error code that errop_result() will report. */
+ errop_code = err;
+}
+
+static int
+errop_start(uint32_t len)
+{
+ errop_code = ENOENT; /* reported unless errop_set() stores another code */
+
+ /* len is ignored; the non-zero return marks the request as failed */
+ return (errop_code);
+}
+
+static void
+errop_data(uint32_t data, uint32_t len)
+{
+ /* Payload words of a failed request are discarded. */
+ /* ignore */
+}
+
+static int
+errop_result(struct iovec **data)
+{
+
+ /* no data to send back; the status is the recorded error code */
+ *data = NULL;
+ return (errop_code);
+}
+
+static void
+errop_done(struct iovec *data)
+{
+
+ /* nothing to release: errop_result() always hands back NULL */
+}
+
+static struct op_info errop_info = {
+ .op_start = errop_start,
+ .op_data = errop_data,
+ .op_result = errop_result,
+ .op_done = errop_done
+};
+
+/* OID search */
+SET_DECLARE(ctl_set, struct ctl);
+
+CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus));
+
+static struct ctl *
+ctl_locate(const char *str, int maxlen)
+{
+ struct ctl *cp, **cpp;
+ /* Return the node whose OID matches 'str' over 'maxlen' bytes, or NULL. */
+ SET_FOREACH(cpp, ctl_set) {
+ cp = *cpp;
+ if (maxlen > 0 && !strncmp(str, cp->c_oid, maxlen))
+ return (cp);
+ }
+ /* An empty name (maxlen <= 0) finds nothing: strncmp(.., 0) is always 0. */
+ return (NULL);
+}
+
+/* uefi-sysctl get-len */
+#define FGET_STRSZ 80
+static struct iovec fget_biov[2];
+static char fget_str[FGET_STRSZ];
+static struct {
+ size_t f_sz;
+ uint32_t f_data[1024];
+} fget_buf;
+static int fget_cnt;
+static size_t fget_size;
+
+static int
+fget_start(uint32_t len)
+{
+ /* Begin collecting an OID name; reject one that cannot fit in fget_str. */
+ if (len > FGET_STRSZ)
+ return(E2BIG);
+
+ fget_cnt = 0;
+
+ return (0);
+}
+
+static void
+fget_data(uint32_t data, uint32_t len)
+{
+ /* Append the word bytewise: fget_str is a char array, not uint32_t storage. */
+ memcpy(&fget_str[fget_cnt], &data, sizeof(data));
+ fget_cnt += sizeof(uint32_t);
+}
+
+static int
+fget_result(struct iovec **data, int val)
+{
+ struct ctl *cp;
+ int err;
+ /* val != 0: reply with length + value; val == 0: reply with length only. */
+ err = 0;
+
+ /* Locate the OID */
+ cp = ctl_locate(fget_str, fget_cnt);
+ if (cp == NULL) {
+ *data = NULL;
+ err = ENOENT;
+ } else {
+ if (val) {
+ /* For now, copy the len/data into a buffer */
+ memset(&fget_buf, 0, sizeof(fget_buf));
+ fget_buf.f_sz = cp->c_len;
+ memcpy(fget_buf.f_data, cp->c_data, cp->c_len); /* NOTE(review): c_len unchecked vs f_data */
+ fget_biov[0].iov_base = (char *)&fget_buf;
+ fget_biov[0].iov_len = sizeof(fget_buf.f_sz) +
+ cp->c_len;
+ } else {
+ fget_size = cp->c_len;
+ fget_biov[0].iov_base = (char *)&fget_size;
+ fget_biov[0].iov_len = sizeof(fget_size);
+ }
+ /* Entry 1 is left empty; only entry 0 carries data. */
+ fget_biov[1].iov_base = NULL;
+ fget_biov[1].iov_len = 0;
+ *data = fget_biov;
+ }
+
+ return (err);
+}
+
+static void
+fget_done(struct iovec *data)
+{
+ /* fget_result() only hands out pointers to static buffers. */
+ /* nothing needs to be freed */
+}
+
+static int
+fget_len_result(struct iovec **data)
+{
+ return (fget_result(data, 0)); /* val == 0: length only */
+}
+
+static int
+fget_val_result(struct iovec **data)
+{
+ return (fget_result(data, 1)); /* val == 1: length followed by the value */
+}
+
+static struct op_info fgetlen_info = {
+ .op_start = fget_start,
+ .op_data = fget_data,
+ .op_result = fget_len_result,
+ .op_done = fget_done
+};
+
+static struct op_info fgetval_info = {
+ .op_start = fget_start,
+ .op_data = fget_data,
+ .op_result = fget_val_result,
+ .op_done = fget_done
+};
+
+static struct req_info {
+ int req_error; /* NOTE(review): not referenced elsewhere in this file */
+ u_int req_count; /* header words received so far (0-3) */
+ uint32_t req_size; /* message size, then payload bytes still expected */
+ uint32_t req_type; /* header word 1: selects the entry in ops[] */
+ uint32_t req_txid; /* header word 2: echoed back in the response */
+ struct op_info *req_op; /* handler chosen by fwctl_request_start() */
+ int resp_error; /* value returned by op_result */
+ int resp_count; /* response header words already sent (0-4) */
+ size_t resp_size; /* iov_len of resp_biov[0], or 0 without a vector */
+ size_t resp_off; /* response payload bytes already returned */
+ struct iovec *resp_biov; /* vector from op_result; may be NULL */
+} rinfo;
+
+static void
+fwctl_response_done(void)
+{
+ /* Give the response vector back to the op, then clear all request state. */
+ (*rinfo.req_op->op_done)(rinfo.resp_biov);
+
+ /* reinit the req data struct */
+ memset(&rinfo, 0, sizeof(rinfo));
+}
+
+static void
+fwctl_request_done(void)
+{
+ /* Collect the op's result and reset the response read position. */
+ rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov);
+
+ /* XXX only a single vector supported at the moment */
+ rinfo.resp_off = 0;
+ if (rinfo.resp_biov == NULL) {
+ rinfo.resp_size = 0;
+ } else {
+ rinfo.resp_size = rinfo.resp_biov[0].iov_len;
+ }
+}
+
+static int
+fwctl_request_start(void)
+{
+ int err;
+ /* Runs once the 3-word header is in; returns 1 if no payload follows. */
+ /* Data size doesn't include header */
+ rinfo.req_size -= 12;
+ /* Unknown or unregistered request types fall back to the error op. */
+ rinfo.req_op = &errop_info;
+ if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL)
+ rinfo.req_op = ops[rinfo.req_type];
+
+ err = (*rinfo.req_op->op_start)(rinfo.req_size);
+ /* A failed start is reported to the guest through the error op. */
+ if (err) {
+ errop_set(err);
+ rinfo.req_op = &errop_info;
+ }
+
+ /* Catch case of zero-length message here */
+ if (rinfo.req_size == 0) {
+ fwctl_request_done();
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+fwctl_request_data(uint32_t value)
+{
+ /* Consume one payload word; returns 1 when the request is complete. */
+ /* Make sure remaining size is >= 0 */
+ if (rinfo.req_size <= sizeof(uint32_t))
+ rinfo.req_size = 0;
+ else
+ rinfo.req_size -= sizeof(uint32_t);
+
+ (*rinfo.req_op->op_data)(value, rinfo.req_size);
+ /* NOTE(review): a trailing 1-3 byte remainder ends the request without another word. */
+ if (rinfo.req_size < sizeof(uint32_t)) {
+ fwctl_request_done();
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+fwctl_request(uint32_t value)
+{
+ /* Feed one 32-bit request word; returns 1 when the request is complete. */
+ int ret;
+
+ ret = 0;
+
+ switch (rinfo.req_count) {
+ case 0: /* word 0: total message size, header included */
+ /* Verify size */
+ if (value < 12) { /* smaller than the 12-byte header: exit the process */
+ printf("msg size error");
+ exit(4);
+ }
+ rinfo.req_size = value;
+ rinfo.req_count = 1;
+ break;
+ case 1: /* word 1: request type */
+ rinfo.req_type = value;
+ rinfo.req_count++;
+ break;
+ case 2: /* word 2: transaction id; the header is now complete */
+ rinfo.req_txid = value;
+ rinfo.req_count++;
+ ret = fwctl_request_start();
+ break;
+ default: /* every later word is payload */
+ ret = fwctl_request_data(value);
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+fwctl_response(uint32_t *retval)
+{
+ uint32_t *dp;
+ ssize_t remlen;
+ /* Store the next response word in *retval; returns 1 after the last one. */
+ switch(rinfo.resp_count) {
+ case 0:
+ /* 4 x u32 header len + data */
+ *retval = 4*sizeof(uint32_t) +
+ roundup(rinfo.resp_size, sizeof(uint32_t));
+ rinfo.resp_count++;
+ break;
+ case 1: /* echo the request type */
+ *retval = rinfo.req_type;
+ rinfo.resp_count++;
+ break;
+ case 2: /* echo the transaction id */
+ *retval = rinfo.req_txid;
+ rinfo.resp_count++;
+ break;
+ case 3: /* status from op_result */
+ *retval = rinfo.resp_error;
+ rinfo.resp_count++;
+ break;
+ default: /* payload words; a final partial word is zero padded */
+ remlen = rinfo.resp_size - rinfo.resp_off;
+ dp = (uint32_t *)
+ ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off);
+ if (remlen >= sizeof(uint32_t)) {
+ *retval = *dp;
+ } else if (remlen > 0) {
+ *retval = fwctl_send_rest(dp, remlen);
+ }
+ rinfo.resp_off += sizeof(uint32_t);
+ break;
+ }
+ /* Finished once the header is out and no payload bytes remain. */
+ if (rinfo.resp_count > 3 &&
+ rinfo.resp_off >= rinfo.resp_size) {
+ fwctl_response_done();
+ return (1);
+ }
+
+ return (0);
+}
+
+
+/*
+ * i/o port handling.
+ */
+static uint8_t
+fwctl_inb(void)
+{
+ uint8_t retval;
+ /* Byte reads return 0xff except while the signature is being sent. */
+ retval = 0xff;
+
+ switch (be_state) {
+ case IDENT_SEND:
+ retval = sig[ident_idx++];
+ if (ident_idx >= sizeof(sig)) /* last signature byte: accept requests next */
+ be_state = REQ;
+ break;
+ default:
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+fwctl_outw(uint16_t val)
+{
+ switch (be_state) {
+ case IDENT_WAIT:
+ if (val == 0) { /* a 16-bit write of 0 starts the signature read-out */
+ be_state = IDENT_SEND;
+ ident_idx = 0;
+ }
+ break;
+ default:
+ /* ignore */
+ break;
+ }
+}
+
+static uint32_t
+fwctl_inl(void)
+{
+ uint32_t retval;
+ /* In RESP return the next response word; in any other state, all ones. */
+ switch (be_state) {
+ case RESP:
+ if (fwctl_response(&retval)) /* last word sent: accept requests again */
+ be_state = REQ;
+ break;
+ default:
+ retval = 0xffffffff;
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+fwctl_outl(uint32_t val)
+{
+ /* 32-bit writes are request words in REQ and are ignored otherwise. */
+ switch (be_state) {
+ case REQ:
+ if (fwctl_request(val))
+ be_state = RESP; /* falls through to the default break below */
+ default:
+ break;
+ }
+
+}
+
+static int
+fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ /* Dispatch on direction and access width; always returns 0. */
+ if (in) {
+ if (bytes == 1)
+ *eax = fwctl_inb();
+ else if (bytes == 4)
+ *eax = fwctl_inl();
+ else
+ *eax = 0xffff; /* any other read width yields 0xffff */
+ } else {
+ if (bytes == 2)
+ fwctl_outw(*eax);
+ else if (bytes == 4)
+ fwctl_outl(*eax); /* writes of other widths are ignored */
+ }
+
+ return (0);
+}
+INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler);
+INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler);
+
+void
+fwctl_init(void)
+{
+ /* Install the GET_LEN and GET ops; other ops[] slots stay NULL. */
+ ops[OP_GET_LEN] = &fgetlen_info;
+ ops[OP_GET] = &fgetval_info;
+
+ be_state = IDENT_WAIT;
+}
diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h
new file mode 100644
index 0000000000..6dad244811
--- /dev/null
+++ b/usr/src/cmd/bhyve/fwctl.h
@@ -0,0 +1,56 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FWCTL_H_
+#define _FWCTL_H_
+
+#include <sys/linker_set.h>
+
+/*
+ * Linker set api for export of information to guest firmware via
+ * a sysctl-like OID interface
+ *
+ * Each struct ctl describes one exported item:
+ *   c_oid  - name string of the item (presumably the OID the guest
+ *            asks for; confirm against the lookup code in fwctl.c)
+ *   c_data - pointer to the data to export
+ *   c_len  - length that goes with c_data (presumably in bytes)
+ */
+struct ctl {
+ const char *c_oid;
+ const void *c_data;
+ const int c_len;
+};
+/*
+ * Define a static struct ctl initialised with (oid, data, len) and add
+ * it to the 'ctl_set' linker set.  The variable's name is built from
+ * '__ctl' and __LINE__ via __CONCAT, so the macro is intended to be
+ * used at most once per source line.
+ */
+#define CTL_NODE(oid, data, len) \
+ static struct ctl __CONCAT(__ctl, __LINE__) = { \
+ oid, \
+ (data), \
+ (len), \
+ }; \
+ DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__))
+/* Set up the fwctl operation table and initial state; see fwctl.c. */
+void fwctl_init(void);
+
+#endif /* _FWCTL_H_ */
diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c
new file mode 100644
index 0000000000..69bcf53c31
--- /dev/null
+++ b/usr/src/cmd/bhyve/gdb.c
@@ -0,0 +1,1332 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017-2018 John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <machine/atomic.h>
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+#include <netinet/in.h>
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "mem.h"
+#include "mevent.h"
+
+/*
+ * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops
+ * use SIGTRAP.  This value is sent as the signal number in the stop
+ * replies built by report_stop() and by the '?' handler in
+ * handle_command().
+ */
+#define GDB_SIGNAL_TRAP 5
+/* Forward declarations (close_connection() uses gdb_resume_vcpus() ahead of its definition). */
+static void gdb_resume_vcpus(void);
+static void check_command(int fd);
+/*
+ * mevent registrations for the connected debugger's socket.  Both are
+ * set in new_connection() and cleared in close_connection().
+ */
+static struct mevent *read_event, *write_event;
+/*
+ * vCPU bookkeeping for the stub:
+ *   vcpus_active    - vCPUs registered through gdb_cpu_add()
+ *   vcpus_suspended - vCPUs the stub wants held; copied from
+ *                     vcpus_active by gdb_suspend_vcpus() and zeroed
+ *                     by gdb_resume_vcpus()
+ *   vcpus_waiting   - vCPUs currently parked in _gdb_cpu_suspend()
+ *   gdb_lock        - mutex taken around updates to this state
+ *   idle_vcpus      - condition variable parked vCPUs sleep on
+ *   stop_pending    - a stop report is deferred until the outstanding
+ *                     response has been ACKed (see check_command())
+ *   first_stop      - set on attach so that the first "all stopped"
+ *                     event is not reported to the debugger
+ *   stepping_vcpu   - vCPU being single-stepped, or -1
+ *   stopped_vcpu    - vCPU to name in the next stop reply, or -1
+ */
+static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting;
+static pthread_mutex_t gdb_lock;
+static pthread_cond_t idle_vcpus;
+static bool stop_pending, first_stop;
+#ifdef __FreeBSD__
+static int stepping_vcpu, stopped_vcpu;
+#else
+static int stepping_vcpu = -1, stopped_vcpu = -1;
+#endif
+
+/*
+ * An I/O buffer contains 'capacity' bytes of room at 'data'. For a
+ * read buffer, 'start' is unused and 'len' contains the number of
+ * valid bytes in the buffer. For a write buffer, 'start' is set to
+ * the index of the next byte in 'data' to send, and 'len' contains
+ * the remaining number of valid bytes to send.
+ *
+ * New bytes are appended at data[start + len] (io_buffer_tail()).
+ * 'data' is (re)allocated by io_buffer_grow(), which only ever
+ * increases 'capacity'.
+ */
+struct io_buffer {
+ uint8_t *data;
+ size_t capacity;
+ size_t start;
+ size_t len;
+};
+/*
+ * Per-connection state:
+ *   cur_comm - bytes received from the debugger, not yet consumed
+ *   cur_resp - bytes queued for (or already sent to) the debugger
+ *   cur_csum - running checksum of the packet being built
+ *   cur_vcpu - vCPU selected by 'H' for register/memory commands
+ *   ctx      - VM context handed to init_gdb()
+ *   cur_fd   - socket of the attached debugger, -1 when there is none
+ */
+static struct io_buffer cur_comm, cur_resp;
+static uint8_t cur_csum;
+static int cur_vcpu;
+static struct vmctx *ctx;
+static int cur_fd = -1;
+/*
+ * Registers fetched for the 'g' packet, in the order gdb_read_regs()
+ * emits them.  Must stay index-aligned with gdb_regsize[].
+ */
+const int gdb_regset[] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS
+};
+/*
+ * Number of bytes emitted for each entry of gdb_regset[] in the 'g'
+ * reply: 8 for RAX..R15 and RIP, 4 for RFLAGS and the segment
+ * registers.
+ */
+const int gdb_regsize[] = {
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4
+};
+
+#ifdef GDB_LOG
+#include <stdarg.h>
+#include <stdio.h>
+
+/*
+ * printf-style trace output, compiled in only with GDB_LOG.
+ *
+ * The log file /tmp/bhyve_gdb.log is opened (truncating) on first use
+ * and switched to line buffering.  If it cannot be opened, or (with
+ * Capsicum) cannot be restricted to writing, the message is dropped and
+ * the open is retried on the next call.
+ */
+static void __printflike(1, 2)
+debug(const char *fmt, ...)
+{
+	static FILE *logfile;
+	va_list ap;
+
+	if (logfile == NULL) {
+		FILE *fp;
+
+		fp = fopen("/tmp/bhyve_gdb.log", "w");
+		if (fp == NULL)
+			return;
+#ifndef WITHOUT_CAPSICUM
+		if (caph_limit_stream(fileno(fp), CAPH_WRITE) == -1) {
+			fclose(fp);
+			return;
+		}
+#endif
+		setlinebuf(fp);
+		logfile = fp;
+	}
+
+	va_start(ap, fmt);
+	vfprintf(logfile, fmt, ap);
+	va_end(ap);
+}
+#else
+/* Tracing disabled: debug() calls compile to nothing. */
+#define debug(...)
+#endif
+
+/*
+ * Fill in 'paging' for address translation on behalf of the debugger,
+ * based on the vCPU's CR0, CR3, CR4 and EFER.
+ *
+ * Returns 0 on success, or -1 if the registers could not be read (in
+ * which case 'paging' is left untouched).
+ */
+static int
+guest_paging_info(int vcpu, struct vm_guest_paging *paging)
+{
+	enum { IDX_CR0, IDX_CR3, IDX_CR4, IDX_EFER, IDX_COUNT };
+	const int regset[IDX_COUNT] = {
+		[IDX_CR0] = VM_REG_GUEST_CR0,
+		[IDX_CR3] = VM_REG_GUEST_CR3,
+		[IDX_CR4] = VM_REG_GUEST_CR4,
+		[IDX_EFER] = VM_REG_GUEST_EFER
+	};
+	uint64_t regs[IDX_COUNT];
+	uint64_t cr0, cr4, efer;
+
+	if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1)
+		return (-1);
+
+	cr0 = regs[IDX_CR0];
+	cr4 = regs[IDX_CR4];
+	efer = regs[IDX_EFER];
+
+	/*
+	 * For the debugger, always pretend to be the kernel (CPL 0),
+	 * and if long-mode is enabled, always parse addresses as if
+	 * in 64-bit mode.
+	 */
+	paging->cr3 = regs[IDX_CR3];
+	paging->cpl = 0;
+
+	if ((efer & EFER_LMA) != 0)
+		paging->cpu_mode = CPU_MODE_64BIT;
+	else if ((cr0 & CR0_PE) != 0)
+		paging->cpu_mode = CPU_MODE_PROTECTED;
+	else
+		paging->cpu_mode = CPU_MODE_REAL;
+
+	if ((cr0 & CR0_PG) == 0)
+		paging->paging_mode = PAGING_MODE_FLAT;
+	else if ((cr4 & CR4_PAE) == 0)
+		paging->paging_mode = PAGING_MODE_32;
+	else
+		paging->paging_mode = (efer & EFER_LME) != 0 ?
+		    PAGING_MODE_64 : PAGING_MODE_PAE;
+
+	return (0);
+}
+
+/*
+ * Translate guest virtual address 'vaddr' to a guest physical address
+ * using the paging state of 'vcpu'.
+ *
+ * Returns 1 and stores the result in *paddr when the address is mapped,
+ * 0 when the translation faults, and -1 when the paging state or the
+ * translation itself could not be obtained.
+ */
+static int
+guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr)
+{
+	struct vm_guest_paging paging;
+	int fault, rc;
+
+	rc = guest_paging_info(vcpu, &paging);
+	if (rc == -1)
+		return (-1);
+
+	/*
+	 * Always use PROT_READ.  We really care if the VA is
+	 * accessible, not if the current vCPU can write.
+	 */
+	rc = vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr,
+	    &fault);
+	if (rc == -1)
+		return (-1);
+
+	return (fault ? 0 : 1);
+}
+
+/* Discard the buffer's contents; the allocation and capacity are kept. */
+static void
+io_buffer_reset(struct io_buffer *io)
+{
+	io->len = 0;
+	io->start = 0;
+}
+
+/* Number of unused bytes between the end of the valid data and 'capacity'. */
+static size_t
+io_buffer_avail(struct io_buffer *io)
+{
+	size_t used;
+
+	used = io->start + io->len;
+	return (io->capacity - used);
+}
+
+/* Pointer to the first valid byte (data[start]). */
+static uint8_t *
+io_buffer_head(struct io_buffer *io)
+{
+	return (&io->data[io->start]);
+}
+
+/* Pointer just past the last valid byte, i.e. where new data is appended. */
+static uint8_t *
+io_buffer_tail(struct io_buffer *io)
+{
+	return (&io->data[io->start + io->len]);
+}
+
+/*
+ * Skip 'amount' bytes at the head of the buffer without moving any
+ * data.  'amount' must not exceed the number of valid bytes.
+ */
+static void
+io_buffer_advance(struct io_buffer *io, size_t amount)
+{
+	assert(amount <= io->len);
+	io->len -= amount;
+	io->start += amount;
+}
+
+/*
+ * Remove 'amount' bytes from the head of the buffer and slide whatever
+ * remains down to index 0, so that 'start' is 0 again on return.
+ */
+static void
+io_buffer_consume(struct io_buffer *io, size_t amount)
+{
+	io_buffer_advance(io, amount);
+
+	/*
+	 * XXX: Consider making this move optional and compacting on a
+	 * future read() before realloc().
+	 */
+	if (io->len != 0)
+		memmove(io->data, io_buffer_head(io), io->len);
+
+	io->start = 0;
+}
+
+/*
+ * Make sure at least 'newsize' bytes can be appended to the buffer,
+ * enlarging the allocation by exactly the shortfall when necessary.
+ * Exits the process via err() if the reallocation fails.
+ */
+static void
+io_buffer_grow(struct io_buffer *io, size_t newsize)
+{
+	uint8_t *bigger;
+	size_t room, wanted;
+
+	room = io_buffer_avail(io);
+	if (room >= newsize)
+		return;
+
+	wanted = io->capacity + (newsize - room);
+	bigger = realloc(io->data, wanted);
+	if (bigger == NULL)
+		err(1, "Failed to grow GDB I/O buffer");
+
+	io->capacity = wanted;
+	io->data = bigger;
+}
+
+/*
+ * Report whether cur_resp holds a response the debugger has not yet
+ * acknowledged.  An empty buffer, or one whose whole content is the
+ * single '+' we sent as an ACK, does not count.
+ */
+static bool
+response_pending(void)
+{
+	bool ack_only;
+
+	if (cur_resp.start == 0 && cur_resp.len == 0)
+		return (false);
+
+	ack_only = (cur_resp.start + cur_resp.len == 1 &&
+	    cur_resp.data[0] == '+');
+	return (!ack_only);
+}
+/*
+ * Tear down the current debugger connection: remove both mevent
+ * registrations (closing the socket through the read event), empty
+ * both I/O buffers, mark the stub as unattached (cur_fd = -1) and let
+ * all vCPUs run again.
+ *
+ * NOTE(review): this function acquires gdb_lock itself, yet
+ * send_pending_data() calls it on a write error and is itself reached
+ * from check_command(), which gdb_readable() runs with gdb_lock already
+ * held.  gdb_lock is created with default attributes in init_gdb(), so
+ * that path appears to re-lock a mutex the thread already owns --
+ * confirm.
+ */
+static void
+close_connection(void)
+{
+
+ /*
+ * XXX: This triggers a warning because mevent does the close
+ * before the EV_DELETE.
+ */
+ pthread_mutex_lock(&gdb_lock);
+ mevent_delete(write_event);
+ mevent_delete_close(read_event);
+ write_event = NULL;
+ read_event = NULL;
+ io_buffer_reset(&cur_comm);
+ io_buffer_reset(&cur_resp);
+ cur_fd = -1;
+
+ /* Resume any stopped vCPUs. */
+ gdb_resume_vcpus();
+ pthread_mutex_unlock(&gdb_lock);
+}
+
+/*
+ * Convert a value in the range 0..15 to its lower-case hexadecimal
+ * character.  Values above 15 are not checked for.
+ */
+static uint8_t
+hex_digit(uint8_t nibble)
+{
+	return (nibble < 10 ? '0' + nibble : 'a' + (nibble - 10));
+}
+
+/*
+ * Convert one hexadecimal character (either case) to its value.
+ * Any character that is not a hex digit yields 0xF rather than an
+ * error.
+ */
+static uint8_t
+parse_digit(uint8_t v)
+{
+	uint8_t off;
+
+	/* Unsigned wrap-around turns each range test into one compare. */
+	off = v - '0';
+	if (off <= 9)
+		return (off);
+
+	off = v - 'a';
+	if (off <= 5)
+		return (off + 10);
+
+	off = v - 'A';
+	if (off <= 5)
+		return (off + 10);
+
+	return (0xF);
+}
+
+/*
+ * Parse 'len' hexadecimal characters starting at 'p', most significant
+ * digit first.  An empty string yields 0.  Non-hex characters are not
+ * rejected; parse_digit() maps each of them to 0xF.
+ */
+static uintmax_t
+parse_integer(const uint8_t *p, size_t len)
+{
+	uintmax_t acc;
+	size_t i;
+
+	acc = 0;
+	for (i = 0; i < len; i++)
+		acc = (acc << 4) | parse_digit(p[i]);
+
+	return (acc);
+}
+
+/* Parse the two hexadecimal characters at p[0] and p[1] into one byte. */
+static uint8_t
+parse_byte(const uint8_t *p)
+{
+	uint8_t hi, lo;
+
+	hi = parse_digit(p[0]);
+	lo = parse_digit(p[1]);
+	return (hi << 4 | lo);
+}
+
+/*
+ * Try to write the unsent part of cur_resp to the debugger socket.
+ *
+ * The write event is disabled once everything has been sent and
+ * (re-)enabled while data remains, so that gdb_writable() retries when
+ * the socket can take more.
+ *
+ * The socket is non-blocking and this function is also called directly
+ * (not only from the write event), so write() may fail with EAGAIN; a
+ * signal may likewise interrupt it with EINTR.  Neither is a
+ * connection failure: the data stays queued and the write event is
+ * armed to retry.  Any other error closes the connection.
+ */
+static void
+send_pending_data(int fd)
+{
+	ssize_t nwritten;
+
+	if (cur_resp.len == 0) {
+		mevent_disable(write_event);
+		return;
+	}
+
+	nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len);
+	if (nwritten == -1) {
+		if (errno == EAGAIN || errno == EINTR) {
+			mevent_enable(write_event);
+			return;
+		}
+		warn("Write to GDB socket failed");
+		close_connection();
+		return;
+	}
+
+	io_buffer_advance(&cur_resp, nwritten);
+	if (cur_resp.len == 0)
+		mevent_disable(write_event);
+	else
+		mevent_enable(write_event);
+}
+
+/*
+ * Queue one raw byte at the end of the response buffer.  The packet
+ * checksum is not updated.
+ */
+static void
+send_char(uint8_t data)
+{
+	uint8_t *slot;
+
+	io_buffer_grow(&cur_resp, 1);
+	slot = io_buffer_tail(&cur_resp);
+	*slot = data;
+	cur_resp.len += 1;
+}
+
+/*
+ * Queue 'len' raw bytes at the end of the response buffer.  The packet
+ * checksum is not updated.
+ */
+static void
+send_data(const uint8_t *data, size_t len)
+{
+	uint8_t *dst;
+
+	io_buffer_grow(&cur_resp, len);
+	dst = io_buffer_tail(&cur_resp);
+	memcpy(dst, data, len);
+	cur_resp.len += len;
+}
+
+/*
+ * Write the two lower-case hex characters for 'v' to buf[0] (high
+ * nibble) and buf[1] (low nibble).  No terminator is written.
+ */
+static void
+format_byte(uint8_t v, uint8_t *buf)
+{
+	buf[1] = hex_digit(v & 0xf);
+	buf[0] = hex_digit(v >> 4);
+}
+
+/*
+ * Queue a byte as two hex characters in the response buffer, without
+ * touching the packet checksum (used for the checksum trailer itself).
+ */
+static void
+send_byte(uint8_t v)
+{
+	uint8_t hex[2];
+
+	format_byte(v, hex);
+	send_data(hex, sizeof(hex));
+}
+
+/* Begin a new packet: reset the running checksum and queue the '$'. */
+static void
+start_packet(void)
+{
+	cur_csum = 0;
+	send_char('$');
+}
+
+/*
+ * Terminate the packet being built: queue '#' followed by the running
+ * checksum as two hex characters, then log the pending response.
+ */
+static void
+finish_packet(void)
+{
+	const uint8_t csum = cur_csum;
+
+	send_char('#');
+	send_byte(csum);
+	debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp));
+}
+
+/*
+ * Add one payload character to the packet being built and fold it into
+ * the running checksum.
+ */
+static void
+append_char(uint8_t v)
+{
+	cur_csum += v;
+	send_char(v);
+}
+
+/*
+ * Add 'len' payload bytes to the packet being built and fold each of
+ * them into the running checksum.
+ */
+static void
+append_packet_data(const uint8_t *data, size_t len)
+{
+	size_t i;
+
+	send_data(data, len);
+	for (i = 0; i < len; i++)
+		cur_csum += data[i];
+}
+
+/*
+ * Add a NUL-terminated string (without the terminator) to the packet
+ * payload verbatim.
+ */
+static void
+append_string(const char *str)
+{
+	size_t n;
+
+	n = strlen(str);
+#ifdef __FreeBSD__
+	append_packet_data(str, n);
+#else
+	append_packet_data((const uint8_t *)str, n);
+#endif
+}
+
+/* Add a byte to the packet payload as two hex characters. */
+static void
+append_byte(uint8_t v)
+{
+	uint8_t hex[2];
+
+	format_byte(v, hex);
+	append_packet_data(hex, sizeof(hex));
+}
+
+/*
+ * Add the low 'len' bytes of 'value' to the payload as hex, least
+ * significant byte first.
+ */
+static void
+append_unsigned_native(uintmax_t value, size_t len)
+{
+	while (len > 0) {
+		append_byte(value);
+		value >>= 8;
+		len--;
+	}
+}
+
+/*
+ * Add the low 'len' bytes of 'value' to the payload as hex, most
+ * significant byte first.  'len' must be non-zero (it sizes a
+ * variable-length array).
+ */
+static void
+append_unsigned_be(uintmax_t value, size_t len)
+{
+	char buf[len * 2];
+	size_t pos;
+
+	/* Fill from the end: the least significant byte goes last. */
+	for (pos = len; pos > 0; pos--) {
+#ifdef __FreeBSD__
+		format_byte(value, buf + (pos - 1) * 2);
+#else
+		format_byte(value, (uint8_t *)(buf + (pos - 1) * 2));
+#endif
+		value >>= 8;
+	}
+#ifdef __FreeBSD__
+	append_packet_data(buf, sizeof(buf));
+#else
+	append_packet_data((const uint8_t *)buf, sizeof(buf));
+#endif
+}
+
+/*
+ * Add 'value' to the payload as big-endian hexadecimal using the
+ * smallest whole number of bytes that holds it; zero is sent as the
+ * single character '0'.
+ *
+ * The byte count is the bit length rounded up to a multiple of eight.
+ * The parentheses matter: without them "7 / 8" is evaluated first and
+ * contributes nothing, so the bit count itself would be used as the
+ * byte count and the number would be padded with leading zeros.
+ */
+static void
+append_integer(unsigned int value)
+{
+
+	if (value == 0)
+		append_char('0');
+	else
+		append_unsigned_be(value, (fls(value) + 7) / 8);
+}
+
+/*
+ * Add a NUL-terminated string to the payload with every character
+ * encoded as two hex digits.
+ */
+static void
+append_asciihex(const char *str)
+{
+	const char *cp;
+
+	for (cp = str; *cp != '\0'; cp++)
+		append_byte(*cp);
+}
+/*
+ * Queue a packet with an empty payload ("$#00").  handle_command() and
+ * gdb_query() use it to answer commands they do not implement.
+ */
+static void
+send_empty_response(void)
+{
+
+ start_packet();
+ finish_packet();
+}
+
+/*
+ * Queue an error reply: 'E' followed by the low eight bits of 'error'
+ * as two hex digits.  Callers pass errno values.
+ */
+static void
+send_error(int error)
+{
+	start_packet();
+	append_char('E');
+	append_byte((uint8_t)error);
+	finish_packet();
+}
+
+/* Queue the success reply, a packet whose payload is "OK". */
+static void
+send_ok(void)
+{
+	static const char ok[] = "OK";
+
+	start_packet();
+	append_string(ok);
+	finish_packet();
+}
+
+/*
+ * Parse a thread-id field.
+ *
+ * Returns 0 for "0", -1 for "-1", -2 when the field is empty, and
+ * otherwise the field parsed as hexadecimal (truncated to int).
+ */
+static int
+parse_threadid(const uint8_t *data, size_t len)
+{
+	switch (len) {
+	case 0:
+		return (-2);
+	case 1:
+		if (data[0] == '0')
+			return (0);
+		break;
+	case 2:
+		if (data[0] == '-' && data[1] == '1')
+			return (-1);
+		break;
+	default:
+		break;
+	}
+
+	return (parse_integer(data, len));
+}
+
+/*
+ * Queue a stop reply reporting SIGTRAP.
+ *
+ * When a particular vCPU caused the stop the reply is
+ * "T05thread:<id>;", with the thread id being the vCPU number plus
+ * one; otherwise it is the plain "S05".  stopped_vcpu is reset to -1
+ * afterwards.
+ */
+static void
+report_stop(void)
+{
+	start_packet();
+	if (stopped_vcpu == -1) {
+		append_char('S');
+		append_byte(GDB_SIGNAL_TRAP);
+	} else {
+		append_char('T');
+		append_byte(GDB_SIGNAL_TRAP);
+		append_string("thread:");
+		append_integer(stopped_vcpu + 1);
+		append_char(';');
+	}
+	stopped_vcpu = -1;
+	finish_packet();
+}
+
+/*
+ * Called once every suspended vCPU is parked.
+ *
+ * The first stop after attach is swallowed (nothing is reported).  If
+ * an earlier response is still awaiting its ACK, the stop report is
+ * deferred via stop_pending; otherwise it is sent right away.
+ */
+static void
+gdb_finish_suspend_vcpus(void)
+{
+	if (first_stop) {
+		first_stop = false;
+		stopped_vcpu = -1;
+		return;
+	}
+
+	if (response_pending()) {
+		stop_pending = true;
+		return;
+	}
+
+	report_stop();
+	send_pending_data(cur_fd);
+}
+/*
+ * Park the calling vCPU thread while the stub has it suspended.
+ * The caller must hold gdb_lock (it is passed to pthread_cond_wait()).
+ *
+ * The vCPU is recorded in vcpus_waiting.  When 'report_stop' is true
+ * and the set of waiting vCPUs now equals the set of suspended ones,
+ * the stop is completed via gdb_finish_suspend_vcpus().  The thread
+ * then sleeps on idle_vcpus until the vCPU is no longer in
+ * vcpus_suspended or has become the stepping vCPU.
+ */
+static void
+_gdb_cpu_suspend(int vcpu, bool report_stop)
+{
+
+ debug("$vCPU %d suspending\n", vcpu);
+ CPU_SET(vcpu, &vcpus_waiting);
+ if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0)
+ gdb_finish_suspend_vcpus();
+ while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu)
+ pthread_cond_wait(&idle_vcpus, &gdb_lock);
+ CPU_CLR(vcpu, &vcpus_waiting);
+ debug("$vCPU %d resuming\n", vcpu);
+}
+
+/*
+ * Register a vCPU with the stub; called on the vCPU's own thread.
+ *
+ * If vCPUs are currently being held, the new vCPU is added to the
+ * suspended set and parked here until it is released.
+ */
+void
+gdb_cpu_add(int vcpu)
+{
+	bool held;
+
+	debug("$vCPU %d starting\n", vcpu);
+	pthread_mutex_lock(&gdb_lock);
+	CPU_SET(vcpu, &vcpus_active);
+
+	/*
+	 * If a vcpu is added while vcpus are stopped, suspend the new
+	 * vcpu so that it will pop back out with a debug exit before
+	 * executing the first instruction.
+	 */
+	held = !CPU_EMPTY(&vcpus_suspended);
+	if (held) {
+		CPU_SET(vcpu, &vcpus_suspended);
+		_gdb_cpu_suspend(vcpu, false);
+	}
+	pthread_mutex_unlock(&gdb_lock);
+}
+/*
+ * Entry point for a vCPU thread that has been asked to suspend: parks
+ * the vCPU under gdb_lock, reporting the stop to the debugger if this
+ * was the last vCPU the stub was waiting for.
+ */
+void
+gdb_cpu_suspend(int vcpu)
+{
+
+ pthread_mutex_lock(&gdb_lock);
+ _gdb_cpu_suspend(vcpu, true);
+ pthread_mutex_unlock(&gdb_lock);
+}
+
+/*
+ * Handle an MTRAP exit on 'vcpu'.
+ *
+ * Only acted upon when 'vcpu' is the one being single-stepped: the
+ * step is finished by turning MTRAP exiting off again, suspending the
+ * vCPU, recording it as the cause of the stop and parking it.  MTRAP
+ * exits on any other vCPU are ignored.
+ */
+void
+gdb_cpu_mtrap(int vcpu)
+{
+	debug("$vCPU %d MTRAP\n", vcpu);
+	pthread_mutex_lock(&gdb_lock);
+	if (vcpu != stepping_vcpu) {
+		pthread_mutex_unlock(&gdb_lock);
+		return;
+	}
+
+	stepping_vcpu = -1;
+	vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);
+	vm_suspend_cpu(ctx, vcpu);
+	assert(stopped_vcpu == -1);
+	stopped_vcpu = vcpu;
+	_gdb_cpu_suspend(vcpu, true);
+	pthread_mutex_unlock(&gdb_lock);
+}
+/*
+ * Ask every active vCPU to stop.  Must be called with gdb_lock held.
+ *
+ * vcpus_suspended becomes a copy of vcpus_active and all vCPUs are
+ * suspended via vm_suspend_cpu(ctx, -1).  If every one of them is
+ * already parked the stop is completed immediately; otherwise the last
+ * vCPU to park completes it from _gdb_cpu_suspend().
+ */
+static void
+gdb_suspend_vcpus(void)
+{
+
+ assert(pthread_mutex_isowned_np(&gdb_lock));
+ debug("suspending all CPUs\n");
+ vcpus_suspended = vcpus_active;
+ vm_suspend_cpu(ctx, -1);
+ if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0)
+ gdb_finish_suspend_vcpus();
+}
+
+/*
+ * Start a single step of 'vcpu'.
+ *
+ * Enables MTRAP exiting on the vCPU, resumes it and marks it as the
+ * stepping vCPU so that _gdb_cpu_suspend() lets it run; the step ends
+ * in gdb_cpu_mtrap().
+ *
+ * Returns false, without resuming anything, when the MTRAP capability
+ * cannot be queried or cannot be enabled.  Resuming without the trap
+ * armed would let the vCPU run freely instead of stepping one
+ * instruction.  Returns true once the step is under way.
+ */
+static bool
+gdb_step_vcpu(int vcpu)
+{
+	int error, val;
+
+	debug("$vCPU %d step\n", vcpu);
+	error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val);
+	if (error < 0)
+		return (false);
+	error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
+	if (error < 0)
+		return (false);
+	vm_resume_cpu(ctx, vcpu);
+	stepping_vcpu = vcpu;
+	pthread_cond_broadcast(&idle_vcpus);
+	return (true);
+}
+/*
+ * Let all vCPUs run again.  Must be called with gdb_lock held.
+ *
+ * Resumes every vCPU via vm_resume_cpu(ctx, -1), empties
+ * vcpus_suspended and wakes the threads parked in _gdb_cpu_suspend().
+ */
+static void
+gdb_resume_vcpus(void)
+{
+
+ assert(pthread_mutex_isowned_np(&gdb_lock));
+ vm_resume_cpu(ctx, -1);
+ debug("resuming all CPUs\n");
+ CPU_ZERO(&vcpus_suspended);
+ pthread_cond_broadcast(&idle_vcpus);
+}
+
+/*
+ * Answer a 'g' packet: fetch every register in gdb_regset[] from the
+ * current vCPU and send them back-to-back, each as gdb_regsize[i]
+ * bytes of hex, least significant byte first.  If the registers cannot
+ * be read an error reply carrying errno is sent instead.
+ */
+static void
+gdb_read_regs(void)
+{
+	uint64_t regvals[nitems(gdb_regset)];
+	size_t idx;
+
+	if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset),
+	    gdb_regset, regvals) == -1) {
+		send_error(errno);
+		return;
+	}
+
+	start_packet();
+	idx = 0;
+	while (idx < nitems(regvals)) {
+		append_unsigned_native(regvals[idx], gdb_regsize[idx]);
+		idx++;
+	}
+	finish_packet();
+}
+
+/*
+ * Answer an 'm' packet ("m<addr>,<length>", both hexadecimal): read
+ * 'length' bytes of guest memory starting at guest virtual address
+ * 'addr' on the current vCPU and reply with the bytes as hex.
+ *
+ * The range is processed one page at a time.  Pages backed by guest
+ * RAM are copied directly; anything else is read through read_mem()
+ * as MMIO using naturally aligned 1, 2 or 4 byte accesses.
+ *
+ * If a failure occurs before any byte has been produced an error reply
+ * is sent; if it occurs later, the bytes gathered so far are returned
+ * as a short reply.
+ */
+static void
+gdb_read_mem(const uint8_t *data, size_t len)
+{
+	uint64_t gpa, gva, val;
+	uint8_t *cp;
+	size_t resid, todo, bytes;
+	bool started;
+	int error;
+
+	cp = memchr(data, ',', len);
+	if (cp == NULL) {
+		send_error(EINVAL);
+		return;
+	}
+	/* data[0] is the 'm' command character itself. */
+	gva = parse_integer(data + 1, cp - (data + 1));
+	resid = parse_integer(cp + 1, len - (cp + 1 - data));
+	started = false;
+
+	while (resid > 0) {
+		error = guest_vaddr2paddr(cur_vcpu, gva, &gpa);
+		if (error == -1) {
+			if (started)
+				finish_packet();
+			else
+				send_error(errno);
+			return;
+		}
+		if (error == 0) {
+			if (started)
+				finish_packet();
+			else
+				send_error(EFAULT);
+			return;
+		}
+
+		/* Read bytes from current page. */
+		todo = getpagesize() - gpa % getpagesize();
+		if (todo > resid)
+			todo = resid;
+
+		cp = paddr_guest2host(ctx, gpa, todo);
+		if (cp != NULL) {
+			/*
+			 * If this page is guest RAM, read it a byte
+			 * at a time.
+			 */
+			if (!started) {
+				start_packet();
+				started = true;
+			}
+			while (todo > 0) {
+				append_byte(*cp);
+				cp++;
+				gpa++;
+				gva++;
+				resid--;
+				todo--;
+			}
+		} else {
+			/*
+			 * If this page isn't guest RAM, try to handle
+			 * it via MMIO.  For MMIO requests, use
+			 * aligned reads of words when possible.
+			 *
+			 * Never read more than 'todo' bytes: with 2 or
+			 * 3 bytes left only a 2-byte access fits.  A
+			 * 4-byte access there would read beyond the
+			 * request and make the unsigned 'todo' and
+			 * 'resid' counters wrap around.
+			 */
+			while (todo > 0) {
+				if (gpa & 1 || todo == 1)
+					bytes = 1;
+				else if (gpa & 2 || todo < 4)
+					bytes = 2;
+				else
+					bytes = 4;
+				error = read_mem(ctx, cur_vcpu, gpa, &val,
+				    bytes);
+				if (error == 0) {
+					if (!started) {
+						start_packet();
+						started = true;
+					}
+					gpa += bytes;
+					gva += bytes;
+					resid -= bytes;
+					todo -= bytes;
+					/* Emit low byte first. */
+					while (bytes > 0) {
+						append_byte(val);
+						val >>= 8;
+						bytes--;
+					}
+				} else {
+					if (started)
+						finish_packet();
+					else
+						send_error(EFAULT);
+					return;
+				}
+			}
+		}
+		assert(resid == 0 || gpa % getpagesize() == 0);
+	}
+	/* A zero-length request is answered with an empty packet. */
+	if (!started)
+		start_packet();
+	finish_packet();
+}
+
+/*
+ * Report whether the 'len' bytes at 'data' begin with the string
+ * 'cmd'.  This is a prefix test: bytes after the prefix are not
+ * examined, so "qC" also matches any longer command starting "qC".
+ */
+static bool
+command_equals(const uint8_t *data, size_t len, const char *cmd)
+{
+	size_t cmdlen;
+
+	cmdlen = strlen(cmd);
+	return (cmdlen <= len && memcmp(data, cmd, cmdlen) == 0);
+}
+/*
+ * Answer a 'q' (general query) packet.  'data'/'len' describe the
+ * whole payload, including the leading 'q'.
+ *
+ *   qAttached        -> "1"
+ *   qC               -> "QC<tid>" for the current vCPU
+ *   qfThreadInfo     -> "m<tid>,<tid>,..." listing every active vCPU,
+ *                       or an EINVAL error when there is none
+ *   qsThreadInfo     -> "l" (the first reply already held the list)
+ *   qThreadExtraInfo -> hex-encoded "vCPU <n>" for the given thread
+ *   anything else    -> empty response
+ *
+ * Thread ids are vCPU numbers plus one.
+ *
+ * NOTE(review): command_equals() is a prefix match and the tests run
+ * in the order listed, so any query that merely begins with "qC" is
+ * answered as qC -- confirm this is acceptable.
+ * NOTE(review): the thread id of qThreadExtraInfo comes straight from
+ * the packet and is only checked for being positive before tid - 1 is
+ * handed to CPU_ISSET() -- confirm an upper bound is not required.
+ */
+static void
+gdb_query(const uint8_t *data, size_t len)
+{
+
+ /*
+ * TODO:
+ * - qSearch
+ * - qSupported
+ */
+ if (command_equals(data, len, "qAttached")) {
+ start_packet();
+ append_char('1');
+ finish_packet();
+ } else if (command_equals(data, len, "qC")) {
+ start_packet();
+ append_string("QC");
+ append_integer(cur_vcpu + 1);
+ finish_packet();
+ } else if (command_equals(data, len, "qfThreadInfo")) {
+ cpuset_t mask;
+ bool first;
+ int vcpu;
+
+ if (CPU_EMPTY(&vcpus_active)) {
+ send_error(EINVAL);
+ return;
+ }
+ mask = vcpus_active;
+ start_packet();
+ append_char('m');
+ first = true;
+ while (!CPU_EMPTY(&mask)) {
+ vcpu = CPU_FFS(&mask) - 1;
+ CPU_CLR(vcpu, &mask);
+ if (first)
+ first = false;
+ else
+ append_char(',');
+ append_integer(vcpu + 1);
+ }
+ finish_packet();
+ } else if (command_equals(data, len, "qsThreadInfo")) {
+ start_packet();
+ append_char('l');
+ finish_packet();
+ } else if (command_equals(data, len, "qThreadExtraInfo")) {
+ char buf[16];
+ int tid;
+
+ data += strlen("qThreadExtraInfo");
+ len -= strlen("qThreadExtraInfo");
+ if (*data != ',') {
+ send_error(EINVAL);
+ return;
+ }
+ tid = parse_threadid(data + 1, len - 1);
+ if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) {
+ send_error(EINVAL);
+ return;
+ }
+/* The description is the text "vCPU <n>", sent hex-encoded. */
+ snprintf(buf, sizeof(buf), "vCPU %d", tid - 1);
+ start_packet();
+ append_asciihex(buf);
+ finish_packet();
+ } else
+ send_empty_response();
+}
+
+/*
+ * Dispatch one checksum-verified packet payload (the bytes between
+ * '$' and '#').  'data' points at the command character and 'len' is
+ * the payload length.
+ *
+ * Implemented commands:
+ *   c  continue all vCPUs (reply is deferred until the next stop)
+ *   D  detach (acknowledged only)
+ *   g  read registers
+ *   H  select the current thread ('g' and 'c' variants)
+ *   m  read memory
+ *   q  general queries, see gdb_query()
+ *   s  single-step the current vCPU (reply deferred until the stop)
+ *   T  thread-alive query
+ *   ?  report the stop reason
+ * Everything else gets the empty "unsupported" response.
+ */
+static void
+handle_command(const uint8_t *data, size_t len)
+{
+
+	/*
+	 * Reject packets with a sequence-id, i.e. two decimal digits
+	 * followed by ':'.  Both data[0] and data[1] must be digits.
+	 */
+	if (len >= 3 && data[0] >= '0' && data[0] <= '9' &&
+	    data[1] >= '0' && data[1] <= '9' && data[2] == ':') {
+		send_empty_response();
+		return;
+	}
+
+	switch (*data) {
+	case 'c':
+		if (len != 1) {
+			send_error(EINVAL);
+			break;
+		}
+
+		/* Don't send a reply until a stop occurs. */
+		gdb_resume_vcpus();
+		break;
+	case 'D':
+		send_ok();
+
+		/* TODO: Resume any stopped CPUs. */
+		break;
+	case 'g': {
+		gdb_read_regs();
+		break;
+	}
+	case 'H': {
+		int tid;
+
+		/* Needs an operation character before the thread id. */
+		if (len < 2 || (data[1] != 'g' && data[1] != 'c')) {
+			send_error(EINVAL);
+			break;
+		}
+		tid = parse_threadid(data + 2, len - 2);
+		if (tid == -2) {
+			send_error(EINVAL);
+			break;
+		}
+
+		if (CPU_EMPTY(&vcpus_active)) {
+			send_error(EINVAL);
+			break;
+		}
+		/* "any" (0) and "all" (-1) both pick the first vCPU. */
+		if (tid == -1 || tid == 0)
+			cur_vcpu = CPU_FFS(&vcpus_active) - 1;
+		else if (CPU_ISSET(tid - 1, &vcpus_active))
+			cur_vcpu = tid - 1;
+		else {
+			send_error(EINVAL);
+			break;
+		}
+		send_ok();
+		break;
+	}
+	case 'm':
+		gdb_read_mem(data, len);
+		break;
+	case 'T': {
+		int tid;
+
+		tid = parse_threadid(data + 1, len - 1);
+		if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) {
+			send_error(EINVAL);
+			break;
+		}
+		send_ok();
+		break;
+	}
+	case 'q':
+		gdb_query(data, len);
+		break;
+	case 's':
+		if (len != 1) {
+			send_error(EINVAL);
+			break;
+		}
+
+		/* Don't send a reply until a stop occurs. */
+		if (!gdb_step_vcpu(cur_vcpu)) {
+			send_error(EOPNOTSUPP);
+			break;
+		}
+		break;
+	case '?':
+		/* XXX: Only if stopped? */
+		/* For now, just report that we are always stopped. */
+		start_packet();
+		append_char('S');
+		append_byte(GDB_SIGNAL_TRAP);
+		finish_packet();
+		break;
+	case 'G': /* TODO */
+	case 'M': /* TODO */
+	case 'v':
+		/* Handle 'vCont' */
+		/* 'vCtrlC' */
+	case 'p': /* TODO */
+	case 'P': /* TODO */
+	case 'Q': /* TODO */
+	case 't': /* TODO */
+	case 'X': /* TODO */
+	case 'z': /* TODO */
+	case 'Z': /* TODO */
+	default:
+		send_empty_response();
+	}
+}
+
+/*
+ * Check for a valid packet in the command buffer.
+ *
+ * Consumes as many complete protocol units from cur_comm as are
+ * present: Ctrl-C (0x03) interrupt bytes, '+' / '-' acknowledgements
+ * and '$'<payload>'#'<two hex digits> packets.  Any other leading byte
+ * is dropped.  Returns when the buffer is empty or holds only the
+ * beginning of a packet.  gdb_readable() calls this with gdb_lock
+ * held; 'fd' is the debugger socket used for replies.
+ */
+static void
+check_command(int fd)
+{
+ uint8_t *head, *hash, *p, sum;
+ size_t avail, plen;
+
+ for (;;) {
+ avail = cur_comm.len;
+ if (avail == 0)
+ return;
+ head = io_buffer_head(&cur_comm);
+ switch (*head) {
+ case 0x03:
+ debug("<- Ctrl-C\n");
+ io_buffer_consume(&cur_comm, 1);
+/* Interrupt request: stop every vCPU. */
+ gdb_suspend_vcpus();
+ break;
+ case '+':
+ /*
+ * ACK of previous response.  The acknowledged response
+ * is dropped, and a stop report that was held back
+ * while it was outstanding is sent now.
+ */
+ debug("<- +\n");
+ if (response_pending())
+ io_buffer_reset(&cur_resp);
+ io_buffer_consume(&cur_comm, 1);
+ if (stop_pending) {
+ stop_pending = false;
+ report_stop();
+ send_pending_data(fd);
+ }
+ break;
+ case '-':
+ /*
+ * NACK of previous response.  Rewind the response
+ * buffer to its beginning so that everything is sent
+ * again, skipping a leading '+' (our own ACK).
+ */
+ debug("<- -\n");
+ if (response_pending()) {
+ cur_resp.len += cur_resp.start;
+ cur_resp.start = 0;
+ if (cur_resp.data[0] == '+')
+ io_buffer_advance(&cur_resp, 1);
+ debug("-> %.*s\n", (int)cur_resp.len,
+ io_buffer_head(&cur_resp));
+ }
+ io_buffer_consume(&cur_comm, 1);
+ send_pending_data(fd);
+ break;
+ case '$':
+ /* Packet. */
+/* A new command discards any response still awaiting its ACK. */
+ if (response_pending()) {
+ warnx("New GDB command while response in "
+ "progress");
+ io_buffer_reset(&cur_resp);
+ }
+
+ /*
+ * Is packet complete?  It is once the '#' and the two
+ * checksum characters after it have arrived; plen
+ * counts everything from '$' through those two.
+ */
+ hash = memchr(head, '#', avail);
+ if (hash == NULL)
+ return;
+ plen = (hash - head + 1) + 2;
+ if (avail < plen)
+ return;
+ debug("<- %.*s\n", (int)plen, head);
+
+ /*
+ * Verify checksum: the 8-bit sum of the payload bytes
+ * must equal the two hex digits after '#'.  On a
+ * mismatch the packet is dropped and NACKed.
+ */
+ for (sum = 0, p = head + 1; p < hash; p++)
+ sum += *p;
+ if (sum != parse_byte(hash + 1)) {
+ io_buffer_consume(&cur_comm, plen);
+ debug("-> -\n");
+ send_char('-');
+ send_pending_data(fd);
+ break;
+ }
+ send_char('+');
+/* The ACK is queued first; any reply is appended after it. */
+ handle_command(head + 1, hash - (head + 1));
+ io_buffer_consume(&cur_comm, plen);
+ if (!response_pending()) {
+ debug("-> +\n");
+ }
+ send_pending_data(fd);
+ break;
+ default:
+ /* XXX: Possibly drop connection instead. */
+ debug("-> %02x\n", *head);
+ io_buffer_consume(&cur_comm, 1);
+ break;
+ }
+ }
+}
+
+/*
+ * mevent callback: the debugger socket is readable.
+ *
+ * Reads whatever is available into cur_comm and hands the buffer to
+ * check_command() under gdb_lock.  End-of-file or a read error other
+ * than EAGAIN closes the connection.
+ */
+static void
+gdb_readable(int fd, enum ev_type event, void *arg)
+{
+	ssize_t nread;
+	int pending;
+
+	if (ioctl(fd, FIONREAD, &pending) == -1) {
+		warn("FIONREAD on GDB socket");
+		return;
+	}
+
+	/*
+	 * 'pending' might be zero due to EOF.  We need to call read
+	 * with a non-zero length to detect EOF.
+	 */
+	if (pending == 0)
+		pending = 1;
+
+	/* Ensure there is room in the command buffer. */
+	io_buffer_grow(&cur_comm, pending);
+	assert(io_buffer_avail(&cur_comm) >= pending);
+
+	nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm));
+	if (nread == 0) {
+		/* Peer closed the connection. */
+		close_connection();
+		return;
+	}
+	if (nread == -1) {
+		if (errno != EAGAIN) {
+			warn("Read from GDB socket");
+			close_connection();
+		}
+		return;
+	}
+
+	cur_comm.len += nread;
+	pthread_mutex_lock(&gdb_lock);
+	check_command(fd);
+	pthread_mutex_unlock(&gdb_lock);
+}
+/*
+ * mevent callback: the debugger socket is writable again, so retry
+ * sending the queued response.  'event' and 'arg' are unused.
+ */
+static void
+gdb_writable(int fd, enum ev_type event, void *arg)
+{
+
+ send_pending_data(fd);
+}
+
+/*
+ * mevent callback: a debugger is connecting to the listening socket.
+ *
+ * Accepts the connection, registers read and write events for it,
+ * resets the per-connection state and stops all vCPUs ("break on
+ * attach").  Only one debugger is served at a time: while one is
+ * attached, further connections are accepted and closed immediately.
+ *
+ * A non-NULL 'arg' marks the initial connection, for which failures
+ * are fatal; otherwise failures just drop the new connection.  Every
+ * failure path releases gdb_lock and the accepted socket and returns
+ * without touching the state of an existing connection.
+ */
+static void
+new_connection(int fd, enum ev_type event, void *arg)
+{
+	int optval, s;
+
+	s = accept4(fd, NULL, NULL, SOCK_NONBLOCK);
+	if (s == -1) {
+		if (arg != NULL)
+			err(1, "Failed accepting initial GDB connection");
+
+		/* Silently ignore errors post-startup. */
+		return;
+	}
+
+	optval = 1;
+	if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) ==
+	    -1) {
+		warn("Failed to disable SIGPIPE for GDB connection");
+		close(s);
+		return;
+	}
+
+	pthread_mutex_lock(&gdb_lock);
+	if (cur_fd != -1) {
+		/* Already attached: refuse, leaving the session alone. */
+		close(s);
+		warnx("Ignoring additional GDB connection.");
+		pthread_mutex_unlock(&gdb_lock);
+		return;
+	}
+
+	read_event = mevent_add(s, EVF_READ, gdb_readable, NULL);
+	if (read_event == NULL) {
+		if (arg != NULL)
+			err(1, "Failed to setup initial GDB connection");
+		/* Nothing owns the socket yet, so close it here. */
+		close(s);
+		pthread_mutex_unlock(&gdb_lock);
+		return;
+	}
+	write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL);
+	if (write_event == NULL) {
+		if (arg != NULL)
+			err(1, "Failed to setup initial GDB connection");
+		/* Deleting the read event also closes the socket. */
+		mevent_delete_close(read_event);
+		read_event = NULL;
+		pthread_mutex_unlock(&gdb_lock);
+		return;
+	}
+
+	cur_fd = s;
+	cur_vcpu = 0;
+	stepping_vcpu = -1;
+	stopped_vcpu = -1;
+	stop_pending = false;
+
+	/* Break on attach. */
+	first_stop = true;
+	gdb_suspend_vcpus();
+	pthread_mutex_unlock(&gdb_lock);
+}
+
+#ifndef WITHOUT_CAPSICUM
+/*
+ * Restrict the listening socket to the Capsicum rights the stub needs
+ * (accept, event, read, write, setsockopt, ioctl) and to the FIONREAD
+ * ioctl only.  Exits with EX_OSERR if either limit cannot be applied.
+ */
+void
+limit_gdb_socket(int s)
+{
+	unsigned long ioctls[] = { FIONREAD };
+	cap_rights_t rights;
+
+	cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE,
+	    CAP_SETSOCKOPT, CAP_IOCTL);
+	if (caph_rights_limit(s, &rights) == -1 ||
+	    caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+}
+#endif
+
+/*
+ * Set up the GDB stub.
+ *
+ * Initialises the stub's lock and condition variable, remembers the VM
+ * context, and creates a non-blocking TCP socket listening on port
+ * 'sport' whose incoming connections are handled by new_connection().
+ * If 'wait' is true the first vCPU is held before it executes anything
+ * until a debugger attaches.
+ *
+ * Any failure is fatal (err()/errc()).
+ *
+ * NOTE(review): the socket is bound to INADDR_ANY, i.e. the stub is
+ * reachable on every interface -- confirm that this is intended.
+ */
+void
+init_gdb(struct vmctx *_ctx, int sport, bool wait)
+{
+	struct sockaddr_in sin;
+	int error, flags, s;
+
+	debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not ");
+
+	error = pthread_mutex_init(&gdb_lock, NULL);
+	if (error != 0)
+		errc(1, error, "gdb mutex init");
+	error = pthread_cond_init(&idle_vcpus, NULL);
+	if (error != 0)
+		errc(1, error, "gdb cv init");
+
+	ctx = _ctx;
+	s = socket(PF_INET, SOCK_STREAM, 0);
+	if (s < 0)
+		err(1, "gdb socket create");
+
+	/* Don't hand uninitialised padding bytes to bind(). */
+	memset(&sin, 0, sizeof(sin));
+#ifdef __FreeBSD__
+	sin.sin_len = sizeof(sin);
+#endif
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(sport);
+
+	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0)
+		err(1, "gdb socket bind");
+
+	if (listen(s, 1) < 0)
+		err(1, "gdb socket listen");
+
+	if (wait) {
+		/*
+		 * Set vcpu 0 in vcpus_suspended.  This will trigger the
+		 * logic in gdb_cpu_add() to suspend the first vcpu before
+		 * it starts execution.  The vcpu will remain suspended
+		 * until a debugger connects.
+		 */
+		stepping_vcpu = -1;
+		stopped_vcpu = -1;
+		CPU_SET(0, &vcpus_suspended);
+	}
+
+	/* A failed F_GETFL must not be fed back into F_SETFL. */
+	flags = fcntl(s, F_GETFL);
+	if (flags == -1 || fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1)
+		err(1, "Failed to mark gdb socket non-blocking");
+
+#ifndef WITHOUT_CAPSICUM
+	limit_gdb_socket(s);
+#endif
+	mevent_add(s, EVF_READ, new_connection, NULL);
+}
diff --git a/usr/src/cmd/bhyve/gdb.h b/usr/src/cmd/bhyve/gdb.h
new file mode 100644
index 0000000000..fa2184df16
--- /dev/null
+++ b/usr/src/cmd/bhyve/gdb.h
@@ -0,0 +1,39 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017 John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __GDB_H__
+#define __GDB_H__
+/*
+ * Interface of the GDB stub (gdb.c):
+ *   gdb_cpu_add()     - register a vCPU with the stub; parks it if
+ *                       vCPUs are currently being held
+ *   gdb_cpu_mtrap()   - report an MTRAP exit, which ends a single step
+ *   gdb_cpu_suspend() - park a vCPU that has been asked to suspend
+ *   init_gdb()        - start listening on TCP port 'sport'; 'wait'
+ *                       holds the first vCPU until a debugger attaches
+ */
+void gdb_cpu_add(int vcpu);
+void gdb_cpu_mtrap(int vcpu);
+void gdb_cpu_suspend(int vcpu);
+void init_gdb(struct vmctx *ctx, int sport, bool wait);
+
+#endif /* !__GDB_H__ */
diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c
new file mode 100644
index 0000000000..b460ee2988
--- /dev/null
+++ b/usr/src/cmd/bhyve/inout.c
@@ -0,0 +1,299 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+#define VERIFY_IOPORT(port, size) \
+ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS)
+
+/*
+ * Per-port dispatch table, indexed by I/O port number (one slot for each
+ * port below MAX_IOPORTS).  Slots are written by register_inout() and
+ * init_inout() and read by emulate_inout().
+ */
+static struct {
+ const char *name; /* name taken from the registering inout_port */
+ int flags; /* IOPORT_F_* bits */
+ inout_func_t handler; /* callback invoked to emulate the access */
+ void *arg; /* opaque pointer handed back to the handler */
+} inout_handlers[MAX_IOPORTS];
+
+/*
+ * Handler installed on every port that no device model has claimed.
+ *
+ * For a read of 1, 2 or 4 bytes an all-ones value of that width is stored
+ * in *eax; any other width leaves *eax untouched.  Writes are ignored.
+ * Always returns 0.
+ */
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	if (!in)
+		return (0);
+
+	if (bytes == 4)
+		*eax = 0xffffffff;
+	else if (bytes == 2)
+		*eax = 0xffff;
+	else if (bytes == 1)
+		*eax = 0xff;
+
+	return (0);
+}
+
+/*
+ * Point the 'size' ports starting at 'start' at default_inout().  The
+ * registration carries IOPORT_F_DEFAULT, so register_inout() does not
+ * apply its overlap check to it.
+ */
+static void
+register_default_iohandler(int start, int size)
+{
+	struct inout_port dflt;
+
+	VERIFY_IOPORT(start, size);
+
+	memset(&dflt, 0, sizeof(dflt));
+	dflt.handler = default_inout;
+	dflt.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT;
+	dflt.size = size;
+	dflt.port = start;
+	dflt.name = "default";
+
+	register_inout(&dflt);
+}
+
+/*
+ * Emulate the port I/O access described by 'vmexit' on behalf of 'vcpu'.
+ *
+ * The access width, direction and port number are taken from
+ * vmexit->u.inout and the handler registered for that port is invoked.
+ *
+ * Returns -1 without calling the handler when 'strict' is set and the port
+ * is still owned by default_inout(), or when the registered flags do not
+ * permit the requested direction.  Otherwise returns the handler's return
+ * value; for string accesses, -1 is also returned if vm_copy_setup()
+ * reports an error.
+ */
+int
+emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
+{
+ int addrsize, bytes, flags, in, port, prot, rep;
+ uint32_t eax, val;
+ inout_func_t handler;
+ void *arg;
+ int error, fault, retval;
+ enum vm_reg_name idxreg;
+ uint64_t gla, index, iterations, count;
+ struct vm_inout_str *vis;
+ struct iovec iov[2];
+
+ bytes = vmexit->u.inout.bytes;
+ in = vmexit->u.inout.in;
+ port = vmexit->u.inout.port;
+
+ assert(port < MAX_IOPORTS);
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ handler = inout_handlers[port].handler;
+
+ /* In strict mode an access to an unclaimed port is an error. */
+ if (strict && handler == default_inout)
+ return (-1);
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ /* Reject accesses in a direction the handler did not register for. */
+ if (in) {
+ if (!(flags & IOPORT_F_IN))
+ return (-1);
+ } else {
+ if (!(flags & IOPORT_F_OUT))
+ return (-1);
+ }
+
+ retval = 0;
+ if (vmexit->u.inout.string) {
+ vis = &vmexit->u.inout_str;
+ rep = vis->inout.rep;
+ addrsize = vis->addrsize;
+ /* An IN stores into guest memory, an OUT loads from it. */
+ prot = in ? PROT_WRITE : PROT_READ;
+ assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
+
+ /* Index register */
+ idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+ index = vis->index & vie_size2mask(addrsize);
+
+ /* Count register */
+ count = vis->count & vie_size2mask(addrsize);
+
+ /* Limit number of back-to-back in/out emulations to 16 */
+ iterations = MIN(count, 16);
+ while (iterations > 0) {
+ assert(retval == 0);
+ if (vie_calculate_gla(vis->paging.cpu_mode,
+ vis->seg_name, &vis->seg_desc, index, bytes,
+ addrsize, prot, &gla)) {
+ vm_inject_gp(ctx, vcpu);
+ break;
+ }
+
+ error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+ bytes, prot, iov, nitems(iov), &fault);
+ if (error) {
+ retval = -1; /* Unrecoverable error */
+ break;
+ } else if (fault) {
+ retval = 0; /* Resume guest to handle fault */
+ break;
+ }
+
+ if (vie_alignment_check(vis->paging.cpl, bytes,
+ vis->cr0, vis->rflags, gla)) {
+ vm_inject_ac(ctx, vcpu, 0);
+ break;
+ }
+
+ /* For OUT, fetch the value to be written from guest memory. */
+ val = 0;
+ if (!in)
+ vm_copyin(ctx, vcpu, iov, &val, bytes);
+
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval != 0)
+ break;
+
+ /* For IN, store the value the handler produced into guest memory. */
+ if (in)
+ vm_copyout(ctx, vcpu, &val, iov, bytes);
+
+ /* Update index */
+ if (vis->rflags & PSL_D)
+ index -= bytes;
+ else
+ index += bytes;
+
+ count--;
+ iterations--;
+ }
+
+ /* Update index register */
+ error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
+ assert(error == 0);
+
+ /*
+ * Update count register only if the instruction had a repeat
+ * prefix.
+ */
+ if (rep) {
+ error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
+ count, addrsize);
+ assert(error == 0);
+ }
+
+ /* Restart the instruction if more iterations remain */
+ if (retval == 0 && count != 0) {
+ error = vm_restart_instruction(ctx, vcpu);
+ assert(error == 0);
+ }
+ } else {
+ /* Non-string access: operate on the low 'bytes' bytes of EAX. */
+ eax = vmexit->u.inout.eax;
+ val = eax & vie_size2mask(bytes);
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval == 0 && in) {
+ /* Merge the value read into EAX, keeping the upper bytes. */
+ eax &= ~vie_size2mask(bytes);
+ eax |= val & vie_size2mask(bytes);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+ eax);
+ assert(error == 0);
+ }
+ }
+ return (retval);
+}
+
+/*
+ * Build the initial port dispatch table: every port first gets the default
+ * handler, then each entry of the inout_port_set linker set replaces the
+ * slot for its port.  Only the single slot at 'port' is written for each
+ * entry, and its handler argument is set to NULL.
+ */
+void
+init_inout(void)
+{
+	struct inout_port **cursor;
+
+	/* Start with every port routed to the default handler. */
+	register_default_iohandler(0, MAX_IOPORTS);
+
+	/* Then install the handlers declared in the linker set. */
+	SET_FOREACH(cursor, inout_port_set) {
+		struct inout_port *decl = *cursor;
+
+		assert(decl->port < MAX_IOPORTS);
+		inout_handlers[decl->port].name = decl->name;
+		inout_handlers[decl->port].flags = decl->flags;
+		inout_handlers[decl->port].handler = decl->handler;
+		inout_handlers[decl->port].arg = NULL;
+	}
+}
+
+/*
+ * Install the handler described by 'iop' on ports
+ * [iop->port, iop->port + iop->size).
+ *
+ * Unless the registration itself carries IOPORT_F_DEFAULT, every port in
+ * the range must currently be owned by the default handler; otherwise
+ * nothing is changed and -1 is returned.  Returns 0 on success.
+ */
+int
+register_inout(struct inout_port *iop)
+{
+	int first, limit, p;
+
+	VERIFY_IOPORT(iop->port, iop->size);
+
+	first = iop->port;
+	limit = iop->port + iop->size;
+
+	/* Refuse to displace a handler somebody else already installed. */
+	if (!(iop->flags & IOPORT_F_DEFAULT)) {
+		for (p = first; p < limit; p++) {
+			if (!(inout_handlers[p].flags & IOPORT_F_DEFAULT))
+				return (-1);
+		}
+	}
+
+	for (p = first; p < limit; p++) {
+		inout_handlers[p].arg = iop->arg;
+		inout_handlers[p].handler = iop->handler;
+		inout_handlers[p].flags = iop->flags;
+		inout_handlers[p].name = iop->name;
+	}
+
+	return (0);
+}
+
+/*
+ * Hand the ports described by 'iop' back to the default handler.  The
+ * slot at iop->port is asserted to hold the same name pointer as 'iop'.
+ * Always returns 0.
+ */
+int
+unregister_inout(struct inout_port *iop)
+{
+	VERIFY_IOPORT(iop->port, iop->size);
+
+	/* Pointer comparison: the caller must pass the name it registered. */
+	assert(iop->name == inout_handlers[iop->port].name);
+
+	register_default_iohandler(iop->port, iop->size);
+	return (0);
+}
diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h
new file mode 100644
index 0000000000..b72ee5d93e
--- /dev/null
+++ b/usr/src/cmd/bhyve/inout.h
@@ -0,0 +1,93 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+struct vm_exit;
+
+/*
+ * inout emulation handlers return 0 on success and -1 on failure.
+ */
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+/*
+ * Description of an I/O port handler registration.  INOUT_PORT() creates
+ * static instances of this structure with size 1 and a zero arg.
+ */
+struct inout_port {
+ const char *name; /* identifying name */
+ int port; /* first port number covered */
+ int size; /* number of consecutive ports covered */
+ int flags; /* IOPORT_F_* bits */
+ inout_func_t handler; /* emulation callback */
+ void *arg; /* opaque pointer passed to the handler */
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT)
+
+/*
+ * The following flags are used internally and must not be used by
+ * device models.
+ */
+#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ 1, \
+ (flags), \
+ (handler), \
+ 0 \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
+ int strict);
+int register_inout(struct inout_port *iop);
+int unregister_inout(struct inout_port *iop);
+void init_bvmcons(void);
+
+#endif /* _INOUT_H_ */
diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c
new file mode 100644
index 0000000000..acdbb5111b
--- /dev/null
+++ b/usr/src/cmd/bhyve/ioapic.c
@@ -0,0 +1,83 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <stdio.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "ioapic.h"
+#include "pci_emul.h"
+#include "pci_lpc.h"
+
+/*
+ * Assign PCI INTx interrupts to I/O APIC pins in a round-robin
+ * fashion. Note that we have no idea what the HPET is using, but the
+ * HPET is also programmable whereas this is intended for hardwired
+ * PCI interrupts.
+ *
+ * This assumes a single I/O APIC where pins >= 16 are permitted for
+ * PCI devices.
+ */
+static int pci_pins;
+
+/*
+ * Work out how many I/O APIC pins are available for PCI interrupts.  The
+ * first 16 pins are not used for PCI, so pci_pins ends up as the reported
+ * pin count minus 16, or 0 if the count cannot be obtained or does not
+ * exceed 16.
+ */
+void
+ioapic_init(struct vmctx *ctx)
+{
+	int rc;
+
+	rc = vm_ioapic_pincount(ctx, &pci_pins);
+
+	/* Ignore the first 16 pins. */
+	if (rc < 0 || pci_pins <= 16)
+		pci_pins = 0;
+	else
+		pci_pins -= 16;
+}
+
+/*
+ * Pick an I/O APIC pin for the INTx interrupt of device 'pi'.
+ *
+ * Returns -1 when no pins are available for PCI.  When lpc_bootrom()
+ * reports a boot ROM, the pin is derived from the slot number and the
+ * device's interrupt pin; otherwise the pins at and above 16 are handed
+ * out round-robin.
+ */
+int
+ioapic_pci_alloc_irq(struct pci_devinst *pi)
+{
+	static int last_pin;
+	int offset;
+
+	if (pci_pins == 0)
+		return (-1);
+
+	if (lpc_bootrom()) {
+		/* For external bootrom use fixed mapping. */
+		offset = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8;
+	} else {
+		offset = last_pin % pci_pins;
+		last_pin++;
+	}
+
+	return (16 + offset);
+}
diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h
new file mode 100644
index 0000000000..3a7fa76192
--- /dev/null
+++ b/usr/src/cmd/bhyve/ioapic.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOAPIC_H_
+#define _IOAPIC_H_
+
+struct pci_devinst;
+
+/*
+ * Allocate a PCI IRQ from the I/O APIC.
+ */
+void ioapic_init(struct vmctx *ctx);
+int ioapic_pci_alloc_irq(struct pci_devinst *pi);
+
+#endif
diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c
new file mode 100644
index 0000000000..54ea22aa94
--- /dev/null
+++ b/usr/src/cmd/bhyve/iov.c
@@ -0,0 +1,148 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include "iov.h"
+
+/*
+ * Produce in iov2 a view of iov1 with the first 'seek' bytes skipped.
+ *
+ * Elements of iov1 that are consumed entirely by the seek are dropped; the
+ * first surviving element is advanced by whatever part of the seek is
+ * left, and the remaining elements are copied unchanged.  *niov2 receives
+ * the number of elements written to iov2 (0 if 'seek' covers everything).
+ * iov2 must have room for niov1 elements.
+ */
+void
+seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2,
+    size_t seek)
+{
+	size_t skip = seek;
+	int first = 0;
+	int out;
+
+	/* Drop the leading elements that the seek swallows whole. */
+	while (first < niov1 && skip >= iov1[first].iov_len) {
+		skip -= iov1[first].iov_len;
+		first++;
+	}
+
+	/* 'skip' now applies only to the first element that is kept. */
+	for (out = 0; first + out < niov1; out++) {
+		const struct iovec *src = &iov1[first + out];
+
+		iov2[out].iov_base = (char *)src->iov_base + skip;
+		iov2[out].iov_len = src->iov_len - skip;
+		skip = 0;
+	}
+
+	*niov2 = out;
+}
+
+/*
+ * Return the sum of the iov_len fields of the first 'niov' elements of
+ * 'iov'.
+ */
+size_t
+count_iov(const struct iovec *iov, int niov)
+{
+	size_t sum = 0;
+
+	while (niov-- > 0)
+		sum += (iov++)->iov_len;
+
+	return (sum);
+}
+
+/*
+ * Shorten the scatter/gather list so that it describes at most 'length'
+ * bytes.
+ *
+ * Elements are walked in order until 'length' bytes have been accounted
+ * for; the element in which the limit is reached has its iov_len reduced
+ * and *niov is set so that it becomes the last element.  If the list holds
+ * fewer than 'length' bytes it is left untouched.  A 'length' of zero
+ * leaves a single zero-length element.
+ *
+ * The previous stop condition compared the per-element amount against the
+ * element length, which is always true for a MIN() of that length, so the
+ * list was always cut down to its first element.
+ */
+void
+truncate_iov(struct iovec *iov, int *niov, size_t length)
+{
+	size_t done = 0;
+	int i;
+
+	for (i = 0; i < *niov; i++) {
+		size_t keep = MIN(length - done, iov[i].iov_len);
+
+		done += keep;
+
+		/* Stop only once the requested length is fully covered. */
+		if (done == length) {
+			iov[i].iov_len = keep;
+			*niov = i + 1;
+			return;
+		}
+	}
+}
+
+/*
+ * Gather the contents of a scatter/gather list into one contiguous buffer.
+ *
+ * *buf must be NULL or a pointer previously obtained from malloc()/
+ * realloc(); it is resized to hold the data and updated to the new
+ * location.  The caller owns the buffer and must free() it.
+ *
+ * Returns the number of bytes copied, or -1 if the buffer could not be
+ * (re)allocated.  On failure *buf is left unchanged and still valid, so
+ * the caller's existing allocation is not leaked.
+ */
+ssize_t
+iov_to_buf(const struct iovec *iov, int niov, void **buf)
+{
+	size_t off, total;
+	char *dst;
+	int i;
+
+	total = count_iov(iov, niov);
+
+	/*
+	 * Never ask realloc() for zero bytes: the result is
+	 * implementation-defined and may free the old buffer while
+	 * returning NULL.
+	 */
+	dst = realloc(*buf, total != 0 ? total : 1);
+	if (dst == NULL)
+		return (-1);
+	*buf = dst;
+
+	for (i = 0, off = 0; i < niov; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		memcpy(dst + off, iov[i].iov_base, iov[i].iov_len);
+		off += iov[i].iov_len;
+	}
+
+	return (total);
+}
+
+/*
+ * Scatter up to 'buflen' bytes from 'buf' into the scatter/gather list
+ * 'iov', starting 'seek' bytes into the space the list describes.
+ *
+ * Returns the number of bytes copied, which is less than 'buflen' when the
+ * list does not have enough room after the seek.  Returns -1 if the
+ * temporary list needed for a non-zero seek cannot be allocated.
+ */
+ssize_t
+buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
+    size_t seek)
+{
+	const char *src = buf;
+	struct iovec *diov;
+	int ndiov, i;
+	size_t off = 0, len;
+
+	/* Nothing to copy into; also avoids a zero-sized allocation. */
+	if (niov <= 0)
+		return (0);
+
+	if (seek > 0) {
+		diov = calloc(niov, sizeof(*diov));
+		if (diov == NULL)
+			return (-1);
+		seek_iov(iov, niov, diov, &ndiov, seek);
+	} else {
+		diov = iov;
+		ndiov = niov;
+	}
+
+	for (i = 0; i < ndiov && off < buflen; i++) {
+		len = MIN(diov[i].iov_len, buflen - off);
+		memcpy(diov[i].iov_base, src + off, len);
+		off += len;
+	}
+
+	/* Only the seek path allocated a private copy of the list. */
+	if (seek > 0)
+		free(diov);
+
+	return ((ssize_t)off);
+}
+
diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h
new file mode 100644
index 0000000000..e3b5916edb
--- /dev/null
+++ b/usr/src/cmd/bhyve/iov.h
@@ -0,0 +1,44 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOV_H_
+#define _IOV_H_
+
+void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
+ int *niov2, size_t seek);
+void truncate_iov(struct iovec *iov, int *niov, size_t length);
+size_t count_iov(const struct iovec *iov, int niov);
+ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
+ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
+ size_t seek);
+
+#endif /* _IOV_H_ */
diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c
new file mode 100644
index 0000000000..85e56af10b
--- /dev/null
+++ b/usr/src/cmd/bhyve/mem.c
@@ -0,0 +1,361 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Memory ranges are represented with an RB tree. On insertion, the range
+ * is checked for overlaps. On lookup, the key has the same base and limit
+ * so it can be searched within the range.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/tree.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <assert.h>
+#include <err.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "mem.h"
+
+/*
+ * One registered memory range, stored as a node in an mmio_rb_tree.
+ * mr_base and mr_end are both inclusive; register_mem_int() sets mr_end
+ * to base + size - 1.
+ */
+struct mmio_rb_range {
+ RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
+ struct mem_range mr_param; /* copy of the caller's registration */
+ uint64_t mr_base; /* first address covered */
+ uint64_t mr_end; /* last address covered */
+};
+
+struct mmio_rb_tree;
+RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;
+
+/*
+ * Per-vCPU cache. Since most accesses from a vCPU will be to
+ * consecutive addresses in a range, it makes sense to cache the
+ * result of a lookup.
+ */
+static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
+
+static pthread_rwlock_t mmio_rwlock;
+
+/*
+ * RB tree ordering function.  Range 'a' sorts before 'b' when it ends
+ * below b's base and after 'b' when it starts above b's end; any overlap
+ * makes the two compare equal.
+ */
+static int
+mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
+{
+	if (a->mr_end < b->mr_base)
+		return (-1);
+
+	return ((a->mr_base > b->mr_end) ? 1 : 0);
+}
+
+/*
+ * Find the range in 'rbt' that contains 'addr'.  On success *entry is set
+ * to the range and 0 is returned; otherwise ENOENT is returned and *entry
+ * is not modified.
+ */
+static int
+mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
+    struct mmio_rb_range **entry)
+{
+	struct mmio_rb_range key, *hit;
+
+	/* A one-address key compares equal to any range that contains it. */
+	key.mr_base = addr;
+	key.mr_end = addr;
+
+	hit = RB_FIND(mmio_rb_tree, rbt, &key);
+	if (hit == NULL)
+		return (ENOENT);
+
+	*entry = hit;
+	return (0);
+}
+
+/*
+ * Insert 'new' into 'rbt'.  Returns 0 on success, or EEXIST if the tree
+ * already holds a range that compares equal to (overlaps) it, in which
+ * case nothing is inserted.
+ */
+static int
+mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
+{
+	struct mmio_rb_range *clash;
+
+	clash = RB_INSERT(mmio_rb_tree, rbt, new);
+	if (clash == NULL)
+		return (0);
+
+#ifdef RB_DEBUG
+	printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
+	    new->mr_base, new->mr_end,
+	    clash->mr_base, clash->mr_end);
+#endif
+
+	return (EEXIST);
+}
+
+#if 0
+/*
+ * Debugging aid (compiled out): print the base, end and name of every
+ * range in 'rbt' while holding mmio_rwlock for reading.
+ */
+static void
+mmio_rb_dump(struct mmio_rb_tree *rbt)
+{
+ int perror;
+ struct mmio_rb_range *np;
+
+ pthread_rwlock_rdlock(&mmio_rwlock);
+ RB_FOREACH(np, mmio_rb_tree, rbt) {
+ printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
+ np->mr_param.name);
+ }
+ perror = pthread_rwlock_unlock(&mmio_rwlock);
+ assert(perror == 0);
+}
+#endif
+
+RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+/*
+ * Callback run by access_memory() once the range containing 'gpa' has been
+ * found; 'mr' is that range's registration and 'arg' is the caller's
+ * opaque pointer.  Its return value becomes access_memory()'s result.
+ */
+typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa,
+ struct mem_range *mr, void *arg);
+
+/*
+ * Read callback passed to vmm_emulate_instruction(): forwards the read to
+ * the handler of the mem_range passed in 'arg' and returns its result.
+ */
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+	struct mem_range *range = arg;
+
+	return ((*range->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+	    rval, range->arg1, range->arg2));
+}
+
+/*
+ * Write callback passed to vmm_emulate_instruction(): forwards the write
+ * of 'wval' to the handler of the mem_range passed in 'arg' and returns
+ * its result.
+ */
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+	struct mem_range *range = arg;
+
+	return ((*range->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+	    &wval, range->arg1, range->arg2));
+}
+
+/*
+ * Locate the registered range containing 'paddr' and run 'cb' on it.
+ *
+ * The lookup order is: the per-vCPU hint, then mmio_rb_root, then
+ * mmio_rb_fallback.  Only a hit in mmio_rb_root updates the hint.
+ * Returns ESRCH when no range contains 'paddr'; otherwise returns
+ * whatever 'cb' returns.
+ *
+ * mmio_rwlock is held for reading across the callback unless the range is
+ * flagged MEM_F_IMMUTABLE, in which case it is dropped before the call.
+ */
+static int
+access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb,
+ void *arg)
+{
+ struct mmio_rb_range *entry;
+ int err, perror, immutable;
+
+ pthread_rwlock_rdlock(&mmio_rwlock);
+ /*
+ * First check the per-vCPU cache
+ */
+ if (mmio_hint[vcpu] &&
+ paddr >= mmio_hint[vcpu]->mr_base &&
+ paddr <= mmio_hint[vcpu]->mr_end) {
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
+ } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
+ /* Not in either tree: release the lock and report the miss. */
+ perror = pthread_rwlock_unlock(&mmio_rwlock);
+ assert(perror == 0);
+ return (ESRCH);
+ }
+ }
+
+ assert(entry != NULL);
+
+ /*
+ * An 'immutable' memory range is guaranteed to be never removed
+ * so there is no need to hold 'mmio_rwlock' while calling the
+ * handler.
+ *
+ * XXX writes to the PCIR_COMMAND register can cause register_mem()
+ * to be called. If the guest is using PCI extended config space
+ * to modify the PCIR_COMMAND register then register_mem() can
+ * deadlock on 'mmio_rwlock'. However by registering the extended
+ * config space window as 'immutable' the deadlock can be avoided.
+ */
+ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+ if (immutable) {
+ perror = pthread_rwlock_unlock(&mmio_rwlock);
+ assert(perror == 0);
+ }
+
+ err = cb(ctx, vcpu, paddr, &entry->mr_param, arg);
+
+ /* For mutable ranges the lock was kept across the callback. */
+ if (!immutable) {
+ perror = pthread_rwlock_unlock(&mmio_rwlock);
+ assert(perror == 0);
+ }
+
+
+ return (err);
+}
+
+/* Arguments carried from emulate_mem() to emulate_mem_cb(). */
+struct emulate_mem_args {
+ struct vie *vie; /* passed on to vmm_emulate_instruction() */
+ struct vm_guest_paging *paging; /* passed on to vmm_emulate_instruction() */
+};
+
+/*
+ * access_memory() callback for emulate_mem(): runs the instruction
+ * emulator against range 'mr', with mem_read()/mem_write() as the memory
+ * accessors, and returns its result.
+ */
+static int
+emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr,
+    void *arg)
+{
+	struct emulate_mem_args *args = arg;
+	struct vm_guest_paging *paging = args->paging;
+	struct vie *vie = args->vie;
+
+	return (vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
+	    mem_read, mem_write, mr));
+}
+
+/*
+ * Emulate the instruction described by 'vie' that accessed guest physical
+ * address 'paddr'.  Returns the result of access_memory(), i.e. ESRCH if
+ * no registered range contains 'paddr', otherwise the emulator's result.
+ */
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+    struct vm_guest_paging *paging)
+{
+	struct emulate_mem_args args;
+
+	args.paging = paging;
+	args.vie = vie;
+
+	return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &args));
+}
+
+/* Arguments carried from read_mem() to read_mem_cb(). */
+struct read_mem_args {
+ uint64_t *rval; /* where the handler stores the value read */
+ int size; /* access size passed to the handler */
+};
+
+/*
+ * access_memory() callback for read_mem(): issues a MEM_F_READ of the
+ * requested size to the handler of range 'mr' and returns its result.
+ */
+static int
+read_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr,
+    void *arg)
+{
+	struct read_mem_args *req = arg;
+	uint64_t *dest = req->rval;
+	int width = req->size;
+
+	return (mr->handler(ctx, vcpu, MEM_F_READ, paddr, width,
+	    dest, mr->arg1, mr->arg2));
+}
+
+/*
+ * Read 'size' bytes at guest physical address 'gpa' through the handler
+ * of the registered range that contains it, storing the value in *rval.
+ * Returns the result of access_memory(), i.e. ESRCH if no range contains
+ * 'gpa', otherwise the handler's result.
+ */
+int
+read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size)
+{
+	struct read_mem_args req;
+
+	req.size = size;
+	req.rval = rval;
+
+	return (access_memory(ctx, vcpu, gpa, read_mem_cb, &req));
+}
+
+/*
+ * Register the memory range described by 'memp' in tree 'rbt'.
+ *
+ * A private copy of *memp is made, so the caller's structure does not
+ * have to outlive the call.
+ *
+ * Returns ENOMEM if the node cannot be allocated and EEXIST if the new
+ * range overlaps one already in the tree.  If 'memp->base' already falls
+ * inside a registered range, nothing is inserted and 0 is returned; that
+ * long-standing behaviour is preserved here, but the unused node is now
+ * released instead of being leaked.
+ */
+static int
+register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
+{
+	struct mmio_rb_range *entry, *mrp;
+	int err, perror, inserted;
+
+	mrp = malloc(sizeof(*mrp));
+	if (mrp == NULL) {
+		/* warn() appends the errno text and a newline itself. */
+		warn("%s: couldn't allocate memory for mrp", __func__);
+		return (ENOMEM);
+	}
+
+	mrp->mr_param = *memp;
+	mrp->mr_base = memp->base;
+	mrp->mr_end = memp->base + memp->size - 1;
+
+	err = 0;
+	inserted = 0;
+
+	pthread_rwlock_wrlock(&mmio_rwlock);
+	if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) {
+		err = mmio_rb_add(rbt, mrp);
+		inserted = (err == 0);
+	}
+	perror = pthread_rwlock_unlock(&mmio_rwlock);
+	assert(perror == 0);
+
+	/* The tree owns the node only if it was actually linked in. */
+	if (!inserted)
+		free(mrp);
+
+	return (err);
+}
+
+/*
+ * Register 'memp' in the primary range tree.  Returns the result of
+ * register_mem_int().
+ */
+int
+register_mem(struct mem_range *memp)
+{
+	struct mmio_rb_tree *tree = &mmio_rb_root;
+
+	return (register_mem_int(tree, memp));
+}
+
+/*
+ * Register 'memp' in the fallback range tree, which access_memory()
+ * consults only when the primary tree has no match.  Returns the result
+ * of register_mem_int().
+ */
+int
+register_mem_fallback(struct mem_range *memp)
+{
+	struct mmio_rb_tree *tree = &mmio_rb_fallback;
+
+	return (register_mem_int(tree, memp));
+}
+
+/*
+ * Remove a range from the primary tree.
+ *
+ * The range containing memp->base is looked up; it is asserted to have
+ * the same name pointer, base and size as 'memp' and not to be flagged
+ * MEM_F_IMMUTABLE.  Any per-vCPU hint that refers to the range is cleared
+ * before the node is freed.
+ *
+ * Returns 0 on success or the lookup error (ENOENT) if no range contains
+ * memp->base.
+ */
+int
+unregister_mem(struct mem_range *memp)
+{
+	struct mmio_rb_range *victim = NULL;
+	struct mem_range *found;
+	int rc, perror, cpu;
+
+	pthread_rwlock_wrlock(&mmio_rwlock);
+
+	rc = mmio_rb_lookup(&mmio_rb_root, memp->base, &victim);
+	if (rc == 0) {
+		found = &victim->mr_param;
+		assert(found->name == memp->name);
+		assert(found->base == memp->base && found->size == memp->size);
+		assert((found->flags & MEM_F_IMMUTABLE) == 0);
+		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, victim);
+
+		/* Drop every cached reference to the node being removed. */
+		for (cpu = 0; cpu < VM_MAXCPU; cpu++) {
+			if (mmio_hint[cpu] == victim)
+				mmio_hint[cpu] = NULL;
+		}
+	}
+
+	perror = pthread_rwlock_unlock(&mmio_rwlock);
+	assert(perror == 0);
+
+	/* 'victim' is still NULL if the lookup failed; free(NULL) is a no-op. */
+	free(victim);
+
+	return (rc);
+}
+
+/*
+ * Initialise the memory range bookkeeping: the lock guarding the trees
+ * and the two (initially empty) range trees.
+ */
+void
+init_mem(void)
+{
+	pthread_rwlock_init(&mmio_rwlock, NULL);
+
+	RB_INIT(&mmio_rb_fallback);
+	RB_INIT(&mmio_rb_root);
+}
diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h
new file mode 100644
index 0000000000..596c0b0cf3
--- /dev/null
+++ b/usr/src/cmd/bhyve/mem.h
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEM_H_
+#define _MEM_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2);
+
+/*
+ * Description of a guest physical memory range whose accesses are
+ * emulated by 'handler'.
+ */
+struct mem_range {
+ const char *name; /* identifying name */
+ int flags; /* MEM_F_* bits */
+ mem_func_t handler; /* called to emulate an access in the range */
+ void *arg1; /* opaque pointer passed to the handler */
+ long arg2; /* opaque integer passed to the handler */
+ uint64_t base; /* first guest physical address covered */
+ uint64_t size; /* length of the range in bytes */
+};
+#define MEM_F_READ 0x1
+#define MEM_F_WRITE 0x2
+#define MEM_F_RW 0x3
+#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
+
+void init_mem(void);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging);
+
+int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval,
+ int size);
+int register_mem(struct mem_range *memp);
+int register_mem_fallback(struct mem_range *memp);
+int unregister_mem(struct mem_range *memp);
+
+#endif /* _MEM_H_ */
diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c
new file mode 100644
index 0000000000..a258fd3047
--- /dev/null
+++ b/usr/src/cmd/bhyve/mevent.c
@@ -0,0 +1,680 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
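+/*
+ * Micro event library, designed for a single i/o thread and having events
+ * be persistent by default.  FreeBSD builds are implemented on top of
+ * kqueue; all other builds use event ports (port_create, port_associate,
+ * port_get) instead.
+ */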
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#ifdef __FreeBSD__
+#include <sys/event.h>
+#else
+#include <port.h>
+#include <sys/poll.h>
+#include <sys/siginfo.h>
+#include <sys/queue.h>
+#endif
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ADD 1
+#define MEV_ENABLE 2
+#define MEV_DISABLE 3
+#define MEV_DEL_PENDING 4
+
+extern char *vmname;
+
+static pthread_t mevent_tid;
+static int mevent_timid = 43;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * One registered event.  Every entry lives on exactly one of the two lists
+ * declared below: change_head while a state change is waiting to be applied
+ * by the i/o thread, global_head otherwise.
+ */
+struct mevent {
+ /* Callback, invoked as me_func(me_fd, me_type, me_param). */
+ void (*me_func)(int, enum ev_type, void *);
+ /* For EVF_TIMER events me_fd holds the period in milliseconds. */
+#define me_msecs me_fd
+ int me_fd;
+ /* Timer identity; only assigned for EVF_TIMER events. */
+#ifdef __FreeBSD__
+ int me_timid;
+#else
+ timer_t me_timid;
+#endif
+ enum ev_type me_type;
+ /* Opaque argument handed back to me_func. */
+ void *me_param;
+ /* 1 while the entry is on change_head, 0 while on global_head. */
+ int me_cq;
+ /* One of the MEV_* values. */
+ int me_state;
+ /* Set by mevent_delete_close(): close me_fd when the change is applied. */
+ int me_closefd;
+#ifndef __FreeBSD__
+ /* Port and user pointer; filled in by mevent_update_pending(). */
+ port_notify_t me_notify;
+ /* Passed to timer_create() for EVF_TIMER events. */
+ struct sigevent me_sigev;
+ /*
+ * B_TRUE for timers, B_FALSE for fd events; when B_FALSE,
+ * mevent_handle_pe() re-applies the event after each callback.
+ */
+ boolean_t me_auto_requeue;
+#endif
+ LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+/*
+ * Take mevent_lmutex, the lock held around every manipulation of
+ * global_head and change_head in this file.
+ */
+static void
+mevent_qlock(void)
+{
+	(void) pthread_mutex_lock(&mevent_lmutex);
+}
+
+/*
+ * Release mevent_lmutex; the counterpart of mevent_qlock().
+ */
+static void
+mevent_qunlock(void)
+{
+	(void) pthread_mutex_unlock(&mevent_lmutex);
+}
+
+/*
+ * Read callback for the wakeup pipe: throw away whatever has been written
+ * to it.  Reading continues for as long as each read() fills the whole
+ * scratch buffer; `type' and `param' are unused.
+ *
+ * NOTE(review): the loop relies on the descriptor being non-blocking, but
+ * no code in this file is seen switching the pipe to non-blocking mode.
+ */
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+	char scratch[MEVENT_MAX];
+	int nread;
+
+	for (;;) {
+		nread = read(fd, scratch, sizeof (scratch));
+		if (nread != MEVENT_MAX)
+			break;
+	}
+}
+
+/*
+ * Wake the i/o thread so that it notices a change queued on change_head.
+ *
+ * Nothing is done when the caller is the i/o thread itself, or when the
+ * write end of the pipe still holds its initial value of 0 (i.e.
+ * mevent_dispatch() has not created the pipe yet).
+ */
+static void
+mevent_notify(void)
+{
+	/* The value is irrelevant, but do not send uninitialized stack. */
+	char c = '\0';
+
+	/*
+	 * If calling from outside the i/o thread, write a byte on the
+	 * pipe to force the i/o thread to exit its blocking wait
+	 * (kevent on FreeBSD, port_get elsewhere).
+	 */
+	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+		/* Best effort: there is no caller to report a failure to. */
+		(void) write(mevent_pipefd[1], &c, 1);
+	}
+}
+#ifdef __FreeBSD__
+/*
+ * Translate an event's ev_type into the matching kqueue EVFILT_* value.
+ * Returns 0 for a type that has no mapping.
+ */
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+	switch (mevp->me_type) {
+	case EVF_READ:
+		return (EVFILT_READ);
+	case EVF_WRITE:
+		return (EVFILT_WRITE);
+	case EVF_TIMER:
+		return (EVFILT_TIMER);
+	case EVF_SIGNAL:
+		return (EVFILT_SIGNAL);
+	default:
+		return (0);
+	}
+}
+
+/*
+ * Translate an event's pending MEV_* state into the kevent EV_* flag that
+ * applies it.  An unknown state trips the assertion; should assertions be
+ * compiled out, 0 is returned rather than an indeterminate value.
+ */
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+	int ret = 0;
+
+	switch (mevp->me_state) {
+	case MEV_ADD:
+		ret = EV_ADD;		/* implicitly enabled */
+		break;
+	case MEV_ENABLE:
+		ret = EV_ENABLE;
+		break;
+	case MEV_DISABLE:
+		ret = EV_DISABLE;
+		break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * Filter-specific flags for an event's kevent entry.  No event type uses
+ * any yet, so this is always 0 and `mevp' is not examined.
+ */
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+	(void) mevp;
+
+	/* XXX nothing yet, perhaps EV_EOF for reads ? */
+	return (0);
+}
+
+/*
+ * Drain change_head into a kevent changelist.
+ *
+ * Each queued entry is either closed (me_closefd set) or converted into
+ * one element of `kev'.  The entry is then taken off change_head and
+ * either freed (MEV_DEL_PENDING) or moved to global_head.  The whole walk
+ * runs with the list lock held.  `mfd' is not used by the body.
+ *
+ * Returns the number of elements written to `kev'.
+ */
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ /* The _SAFE variant is needed: entries are unlinked/freed in the loop. */
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ /* Timers are keyed by me_timid and carry their period in data. */
+ if (mevp->me_type == EVF_TIMER) {
+ kev[i].ident = mevp->me_timid;
+ kev[i].data = mevp->me_msecs;
+ } else {
+ kev[i].ident = mevp->me_fd;
+ kev[i].data = 0;
+ }
+ kev[i].filter = mevent_kq_filter(mevp);
+ kev[i].flags = mevent_kq_flags(mevp);
+ kev[i].fflags = mevent_kq_fflags(mevp);
+ /* udata lets mevent_handle() find the mevent again. */
+ kev[i].udata = mevp;
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ /*
+ * Checked after the increment, so this fires as soon as the
+ * MEVENT_MAX'th slot has been filled.
+ */
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+/*
+ * Run the callback of each of the `numev' events returned by kevent().
+ * The mevent is recovered from the udata pointer stored by mevent_build().
+ * A non-positive `numev' results in no calls.
+ */
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+	int idx;
+
+	for (idx = 0; idx < numev; idx++) {
+		struct mevent *ev = kev[idx].udata;
+
+		/* XXX check for EV_ERROR ? */
+
+		ev->me_func(ev->me_fd, ev->me_type, ev->me_param);
+	}
+}
+
+#else /* __FreeBSD__ */
+
+/*
+ * Apply the pending state (me_state) of a single event to the event port
+ * recorded in me_notify.portnfy_port.
+ *
+ * EVF_READ/EVF_WRITE events are associated with, or dissociated from, the
+ * port.  EVF_TIMER events create and arm, or delete, a timer.  Failures of
+ * the underlying calls are reported on stderr and otherwise ignored.  Any
+ * other type or state aborts the process.
+ */
+static void
+mevent_update_one(struct mevent *mevp)
+{
+ int portfd = mevp->me_notify.portnfy_port;
+
+ switch (mevp->me_type) {
+ case EVF_READ:
+ case EVF_WRITE:
+ /* mevent_handle_pe() re-applies these after every callback. */
+ mevp->me_auto_requeue = B_FALSE;
+
+ switch (mevp->me_state) {
+ case MEV_ADD:
+ case MEV_ENABLE:
+ {
+ int events;
+
+ events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT;
+
+ /* mevp is the user cookie returned later in portev_user. */
+ if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd,
+ events, mevp) != 0) {
+ (void) fprintf(stderr,
+ "port_associate fd %d %p failed: %s\n",
+ mevp->me_fd, mevp, strerror(errno));
+ }
+ return;
+ }
+ case MEV_DISABLE:
+ case MEV_DEL_PENDING:
+ /*
+ * A disable that comes in while an event is being
+ * handled will result in an ENOENT.
+ */
+ if (port_dissociate(portfd, PORT_SOURCE_FD,
+ mevp->me_fd) != 0 && errno != ENOENT) {
+ (void) fprintf(stderr, "port_dissociate "
+ "portfd %d fd %d mevp %p failed: %s\n",
+ portfd, mevp->me_fd, mevp, strerror(errno));
+ }
+ return;
+ default:
+ goto abort;
+ }
+
+ case EVF_TIMER:
+ /* The timer is armed with an interval, so no re-apply is needed. */
+ mevp->me_auto_requeue = B_TRUE;
+
+ switch (mevp->me_state) {
+ case MEV_ADD:
+ case MEV_ENABLE:
+ {
+ struct itimerspec it = { 0 };
+
+ /* Expirations are delivered to the port named in me_notify. */
+ mevp->me_sigev.sigev_notify = SIGEV_PORT;
+ mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify;
+
+ if (timer_create(CLOCK_REALTIME, &mevp->me_sigev,
+ &mevp->me_timid) != 0) {
+ (void) fprintf(stderr,
+ "timer_create failed: %s", strerror(errno));
+ return;
+ }
+
+ /* The first timeout */
+ it.it_value.tv_sec = mevp->me_msecs / MILLISEC;
+ it.it_value.tv_nsec =
+ MSEC2NSEC(mevp->me_msecs % MILLISEC);
+ /* Repeat at the same interval */
+ it.it_interval = it.it_value;
+
+ if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) {
+ (void) fprintf(stderr, "timer_settime failed: "
+ "%s", strerror(errno));
+ }
+ return;
+ }
+ case MEV_DISABLE:
+ case MEV_DEL_PENDING:
+ /* Disabling deletes the timer; MEV_ENABLE creates a fresh one. */
+ if (timer_delete(mevp->me_timid) != 0) {
+ (void) fprintf(stderr, "timer_delete failed: "
+ "%s", strerror(errno));
+ }
+ return;
+ default:
+ goto abort;
+ }
+ default:
+ /* EVF_SIGNAL not yet implemented. */
+ goto abort;
+ }
+
+ /* Only reached through the gotos above: every handled case returns. */
+abort:
+ (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__,
+ mevp->me_type, mevp->me_state);
+ abort();
+}
+
+/*
+ * Apply every change queued on change_head to the event port `portfd'.
+ *
+ * Each entry first has its me_notify filled in, and is then either closed
+ * (me_closefd set) or passed to mevent_update_one().  The entry is then
+ * taken off change_head and either freed (MEV_DEL_PENDING) or moved to
+ * global_head.  The whole walk runs with the list lock held.
+ */
+static void
+mevent_update_pending(int portfd)
+{
+ struct mevent *mevp, *tmpp;
+
+ mevent_qlock();
+
+ /* The _SAFE variant is needed: entries are unlinked/freed in the loop. */
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ /* mevent_update_one() reads the port back out of me_notify. */
+ mevp->me_notify.portnfy_port = portfd;
+ mevp->me_notify.portnfy_user = mevp;
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ (void) close(mevp->me_fd);
+ mevp->me_fd = -1;
+ } else {
+ mevent_update_one(mevp);
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+ }
+
+ mevent_qunlock();
+}
+
+/*
+ * Deliver one event returned by port_get() to its callback, then re-apply
+ * the event to the port when that is required.
+ *
+ * The caller must NOT hold the list lock: the callback runs unlocked, and
+ * the lock is taken here only around the re-apply step.  (An earlier
+ * version began by unlocking a mutex that mevent_dispatch() never holds at
+ * this point; that unbalanced unlock has been removed.)
+ */
+static void
+mevent_handle_pe(port_event_t *pe)
+{
+	struct mevent *mevp = pe->portev_user;
+
+	(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+
+	mevent_qlock();
+	/*
+	 * Skip the re-apply when a change is already queued (me_cq), since
+	 * mevent_update_pending() will apply the new state itself, and for
+	 * events that requeue on their own (timers).
+	 */
+	if (!mevp->me_cq && !mevp->me_auto_requeue) {
+		mevent_update_one(mevp);
+	}
+	mevent_qunlock();
+}
+#endif
+
+/*
+ * Register a new event and queue it for the i/o thread.
+ *
+ * tfd:   descriptor to watch or, for EVF_TIMER, the period in milliseconds
+ * type:  kind of event
+ * func:  callback; must not be NULL
+ * param: opaque argument passed back to func
+ *
+ * Returns the new event, or NULL when tfd is negative, func is NULL, a
+ * non-timer event with the same fd and type already exists on either list,
+ * or allocation fails.
+ */
+struct mevent *
+mevent_add(int tfd, enum ev_type type,
+    void (*func)(int, enum ev_type, void *), void *param)
+{
+	struct mevent *cur;
+	struct mevent *newev = NULL;
+
+	if (tfd < 0 || func == NULL)
+		return (NULL);
+
+	mevent_qlock();
+
+	/*
+	 * Timers are never treated as duplicates; anything else must have
+	 * an fd/type pair that appears on neither list.
+	 */
+	if (type != EVF_TIMER) {
+		LIST_FOREACH(cur, &global_head, me_list) {
+			if (cur->me_fd == tfd && cur->me_type == type)
+				goto done;
+		}
+
+		LIST_FOREACH(cur, &change_head, me_list) {
+			if (cur->me_fd == tfd && cur->me_type == type)
+				goto done;
+		}
+	}
+
+	newev = calloc(1, sizeof (struct mevent));
+	if (newev == NULL)
+		goto done;
+
+	newev->me_func = func;
+	newev->me_param = param;
+	newev->me_type = type;
+	if (type == EVF_TIMER) {
+		newev->me_msecs = tfd;
+		newev->me_timid = mevent_timid++;
+	} else {
+		newev->me_fd = tfd;
+	}
+
+	/* Hand the entry to the i/o thread via the change list. */
+	newev->me_state = MEV_ADD;
+	newev->me_cq = 1;
+	LIST_INSERT_HEAD(&change_head, newev, me_list);
+	mevent_notify();
+
+done:
+	mevent_qunlock();
+
+	return (newev);
+}
+
+/*
+ * Request that an event move to `newstate' (MEV_ENABLE or MEV_DISABLE).
+ *
+ * Returns EINVAL when the event is already marked for deletion, otherwise
+ * 0.  When the state really changes, the entry is moved onto change_head
+ * (unless already there) and the i/o thread is woken.
+ *
+ * me_state is examined only with the list lock held: other threads and
+ * the i/o thread modify it under that same lock.
+ */
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+	int error = 0;
+
+	mevent_qlock();
+
+	if (evp->me_state == MEV_DEL_PENDING) {
+		/*
+		 * It's not possible to enable/disable a deleted event
+		 */
+		error = EINVAL;
+	} else if (evp->me_state != newstate) {
+		evp->me_state = newstate;
+
+		/*
+		 * Place the entry onto the changed list if not already
+		 * there.
+		 */
+		if (evp->me_cq == 0) {
+			evp->me_cq = 1;
+			LIST_REMOVE(evp, me_list);
+			LIST_INSERT_HEAD(&change_head, evp, me_list);
+			mevent_notify();
+		}
+	}
+
+	mevent_qunlock();
+
+	return (error);
+}
+
+/*
+ * Ask for `evp' to be enabled.  Returns what mevent_update() returns:
+ * 0, or EINVAL when the event is pending deletion.
+ */
+int
+mevent_enable(struct mevent *evp)
+{
+	return (mevent_update(evp, MEV_ENABLE));
+}
+
+/*
+ * Ask for `evp' to be disabled.  Returns what mevent_update() returns:
+ * 0, or EINVAL when the event is pending deletion.
+ */
+int
+mevent_disable(struct mevent *evp)
+{
+	return (mevent_update(evp, MEV_DISABLE));
+}
+
+/*
+ * Mark `evp' for deletion and make sure it is queued on change_head, where
+ * the i/o thread will free it.  A non-zero `closefd' additionally asks for
+ * the event's descriptor to be closed at that time.  Always returns 0.
+ */
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+	mevent_qlock();
+
+	evp->me_state = MEV_DEL_PENDING;
+	if (closefd)
+		evp->me_closefd = 1;
+
+	/* Queue the entry unless an earlier change already did. */
+	if (evp->me_cq == 0) {
+		evp->me_cq = 1;
+		LIST_REMOVE(evp, me_list);
+		LIST_INSERT_HEAD(&change_head, evp, me_list);
+		mevent_notify();
+	}
+
+	mevent_qunlock();
+
+	return (0);
+}
+
+/*
+ * Queue `evp' for deletion, leaving its descriptor open.  Always returns 0.
+ */
+int
+mevent_delete(struct mevent *evp)
+{
+	return (mevent_delete_event(evp, 0));
+}
+
+/*
+ * Queue `evp' for deletion and have its descriptor closed when the
+ * deletion is processed.  Always returns 0.
+ */
+int
+mevent_delete_close(struct mevent *evp)
+{
+	return (mevent_delete_event(evp, 1));
+}
+
+/*
+ * Give the thread recorded in mevent_tid the name "mevent".
+ */
+static void
+mevent_set_name(void)
+{
+	pthread_set_name_np(mevent_tid, "mevent");
+}
+
+/*
+ * Event loop; never returns.  The calling thread becomes the i/o thread.
+ *
+ * Creates the kqueue (FreeBSD) or event port (elsewhere) and the wakeup
+ * pipe, registers the pipe's read end, and then loops forever: apply the
+ * queued changes, block for events, run the callbacks.
+ */
+void
+mevent_dispatch(void)
+{
+#ifdef __FreeBSD__
+	struct kevent changelist[MEVENT_MAX];
+	struct kevent eventlist[MEVENT_MAX];
+	struct mevent *pipev;
+	int mfd;
+	int numev;
+#else
+	struct mevent *pipev;
+	int portfd;
+#endif
+	int ret;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	/* Recorded so that mevent_notify() can recognize the i/o thread. */
+	mevent_tid = pthread_self();
+	mevent_set_name();
+
+#ifdef __FreeBSD__
+	mfd = kqueue();
+	assert(mfd > 0);
+#else
+	portfd = port_create();
+	assert(portfd >= 0);
+#endif
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_KQUEUE);
+	if (caph_rights_limit(mfd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	/*
+	 * Open the pipe that will be used for other threads to force
+	 * the blocking wait to exit by writing to it.
+	 * NOTE(review): mevent_pipe_read() expects the read end to be
+	 * non-blocking, but nothing here sets that mode.
+	 */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		exit(0);
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	/*
+	 * Add internal event handler for the pipe read fd
+	 */
+	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+	assert(pipev != NULL);
+
+	for (;;) {
+#ifdef __FreeBSD__
+		/*
+		 * Build changelist if required.
+		 * XXX the changelist can be put into the blocking call
+		 * to eliminate the extra syscall. Currently better for
+		 * debug.
+		 */
+		numev = mevent_build(mfd, changelist);
+		if (numev) {
+			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+			if (ret == -1) {
+				perror("Error return from kevent change");
+			}
+		}
+
+		/*
+		 * Block awaiting events
+		 */
+		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+		if (ret == -1 && errno != EINTR) {
+			perror("Error return from kevent monitor");
+		}
+
+		/*
+		 * Handle reported events (a no-op when ret is -1)
+		 */
+		mevent_handle(eventlist, ret);
+
+#else /* __FreeBSD__ */
+		port_event_t pev;
+
+		/* Handle any pending updates */
+		mevent_update_pending(portfd);
+
+		/* Block awaiting events */
+		ret = port_get(portfd, &pev, NULL);
+		if (ret != 0) {
+			/*
+			 * No event was stored in pev on any failure,
+			 * EINTR included, so never fall through to the
+			 * handler; an interrupted wait is just retried
+			 * quietly.
+			 */
+			if (errno != EINTR)
+				perror("Error return from port_get");
+			continue;
+		}
+
+		/* Handle reported event */
+		mevent_handle_pe(&pev);
+#endif /* __FreeBSD__ */
+	}
+}
diff --git a/usr/src/cmd/bhyve/mevent.h b/usr/src/cmd/bhyve/mevent.h
new file mode 100644
index 0000000000..e6b96f0a7c
--- /dev/null
+++ b/usr/src/cmd/bhyve/mevent.h
@@ -0,0 +1,53 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+/*
+ * Kind of event passed to mevent_add() and handed back to the callback.
+ * For EVF_TIMER the integer argument of mevent_add() is a period in
+ * milliseconds rather than a file descriptor.
+ */
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE,
+ EVF_TIMER,
+ EVF_SIGNAL
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c
new file mode 100644
index 0000000000..4da3adb5ae
--- /dev/null
+++ b/usr/src/cmd/bhyve/mevent_test.c
@@ -0,0 +1,282 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test program for the micro event library. Set up a simple TCP echo
+ * service.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/stdint.h>
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#endif
+#include <sys/socket.h>
+#include <netinet/in.h>
+#ifdef __FreeBSD__
+#include <machine/cpufunc.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+static struct mevent *tevp;
+
+char *vmname = "test vm";
+
+
+#define MEVENT_ECHO
+
+/* Number of timer events to capture */
+#define TEVSZ 4096
+uint64_t tevbuf[TEVSZ];
+
+/*
+ * Print the minimum, maximum and mean interval, in microseconds, between
+ * consecutive samples stored in tevbuf.
+ *
+ * On FreeBSD the samples are TSC readings, scaled with the
+ * machdep.tsc_freq sysctl; elsewhere they are gethrtime() values in
+ * nanoseconds.  If the TSC frequency cannot be obtained nothing is
+ * computed, because the conversion would divide by an unknown value.
+ */
+static void
+timer_print(void)
+{
+	uint64_t min, max, diff, sum;
+#ifdef __FreeBSD__
+	uint64_t tsc_freq;
+	size_t len;
+#endif
+	int j;
+
+	min = UINT64_MAX;
+	max = 0;
+	sum = 0;
+
+#ifdef __FreeBSD__
+	tsc_freq = 0;
+	len = sizeof(tsc_freq);
+	if (sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0) != 0 ||
+	    tsc_freq == 0) {
+		fprintf(stderr, "cannot determine machdep.tsc_freq\n");
+		return;
+	}
+#endif
+
+	for (j = 1; j < TEVSZ; j++) {
+#ifdef __FreeBSD__
+		/* Convert a tsc diff into microseconds */
+		diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq;
+#else
+		/* Convert a nanosecond diff into microseconds */
+		diff = (tevbuf[j] - tevbuf[j-1]) / 1000;
+#endif
+		sum += diff;
+		if (min > diff)
+			min = diff;
+		if (max < diff)
+			max = diff;
+	}
+
+	/* uint64_t is not necessarily long: cast to match the format. */
+	printf("timers done: usecs, min %llu, max %llu, mean %llu\n",
+	    (unsigned long long)min, (unsigned long long)max,
+	    (unsigned long long)(sum / (TEVSZ - 1)));
+}
+
+/*
+ * Timer callback: append the current timestamp to tevbuf.  Once TEVSZ
+ * samples have been taken the timer is deleted and the statistics are
+ * printed; a further invocation after that aborts.  The arguments are
+ * unused.
+ */
+static void
+timer_callback(int fd, enum ev_type type, void *param)
+{
+	static int nsamples;
+	uint64_t now;
+
+	if (nsamples >= TEVSZ)
+		abort();
+
+#ifdef __FreeBSD__
+	now = rdtsc();
+#else
+	now = gethrtime();
+#endif
+	tevbuf[nsamples] = now;
+	nsamples++;
+
+	if (nsamples == TEVSZ) {
+		mevent_delete(tevp);
+		timer_print();
+	}
+}
+
+
+#ifdef MEVENT_ECHO
+/*
+ * Mutex/condition-variable pair used by echoer_callback() to wake the
+ * echoer() thread that owns it.
+ */
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+/*
+ * Read-event callback for an echo connection: signal the condition
+ * variable in the struct esync passed as `param', waking its echoer()
+ * thread.  `fd' and `type' are unused.
+ */
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+	struct esync *es = param;
+
+	pthread_mutex_lock(&es->e_mt);
+	pthread_cond_signal(&es->e_cond);
+	pthread_mutex_unlock(&es->e_mt);
+}
+
+/*
+ * Per-connection thread (event-driven variant).  `param' carries the
+ * connected socket descriptor.  Each time echoer_callback() signals, one
+ * chunk is read and written back to the socket and to descriptor 0.  The
+ * loop ends on EOF or a read error, after which the event is deleted and
+ * the socket closed through mevent_delete_close().
+ *
+ * NOTE(review): the second copy goes to descriptor 0, whereas the
+ * non-event variant of this function writes to descriptor 1 -- confirm
+ * which is intended.
+ */
+static void *
+echoer(void *param)
+{
+	struct esync sync;
+	struct mevent *mev;
+	char buf[128];
+	int fd = (int)(uintptr_t) param;
+	int len;
+
+	pthread_mutex_init(&sync.e_mt, NULL);
+	pthread_cond_init(&sync.e_cond, NULL);
+
+	/* Hold the mutex before registering so that no wakeup is missed. */
+	pthread_mutex_lock(&sync.e_mt);
+
+	mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+	if (mev == NULL) {
+		printf("Could not allocate echoer event\n");
+		exit(4);
+	}
+
+	while (pthread_cond_wait(&sync.e_cond, &sync.e_mt) == 0) {
+		len = read(fd, buf, sizeof(buf));
+		if (len <= 0)
+			break;
+
+		write(fd, buf, len);
+		write(0, buf, len);
+	}
+
+	mevent_delete_close(mev);
+
+	pthread_mutex_unlock(&sync.e_mt);
+	pthread_mutex_destroy(&sync.e_mt);
+	pthread_cond_destroy(&sync.e_cond);
+
+	return (NULL);
+}
+
+#else
+
+/*
+ * Per-connection thread (plain blocking variant).  `param' carries the
+ * connected socket descriptor; everything read from it is copied to
+ * descriptor 1 until EOF or a read error.
+ */
+static void *
+echoer(void *param)
+{
+	char buf[128];
+	int fd = (int)(uintptr_t) param;
+	int len;
+
+	for (;;) {
+		len = read(fd, buf, sizeof(buf));
+		if (len <= 0)
+			break;
+		write(1, buf, len);
+	}
+
+	return (NULL);
+}
+#endif /* MEVENT_ECHO */
+
+/*
+ * Read-event callback for the listening socket: wake the acceptor()
+ * thread through accept_condvar.  All three arguments are unused.
+ */
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+	(void) pthread_mutex_lock(&accept_mutex);
+	(void) pthread_cond_signal(&accept_condvar);
+	(void) pthread_mutex_unlock(&accept_mutex);
+}
+
+/*
+ * Listener thread.  Binds a TCP socket to TEST_PORT on all addresses,
+ * registers it for read events, and then accepts one connection per
+ * wakeup from acceptor_callback(), starting an echoer() thread for each.
+ * The first accepted connection also starts the 1 ms test timer.
+ *
+ * Any setup failure terminates the process with status 4.  `param' is
+ * unused.
+ */
+static void *
+acceptor(void *param)
+{
+	/* Zeroed so that padding bytes handed to bind() are defined. */
+	struct sockaddr_in sin = { 0 };
+	pthread_t tid;
+	int news;
+	int s;
+
+	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+		perror("cannot create socket");
+		exit(4);
+	}
+
+#ifdef __FreeBSD__
+	sin.sin_len = sizeof(sin);
+#endif
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(TEST_PORT);
+
+	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+		perror("cannot bind socket");
+		exit(4);
+	}
+
+	if (listen(s, 1) < 0) {
+		perror("cannot listen socket");
+		exit(4);
+	}
+
+	/* Without the event nothing would ever signal accept_condvar. */
+	if (mevent_add(s, EVF_READ, acceptor_callback, NULL) == NULL) {
+		printf("Could not allocate acceptor event\n");
+		exit(4);
+	}
+
+	pthread_mutex_lock(&accept_mutex);
+
+	while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+		news = accept(s, NULL, NULL);
+		if (news < 0) {
+			perror("accept error");
+		} else {
+			static int first = 1;
+
+			if (first) {
+				/*
+				 * Start a timer
+				 */
+				first = 0;
+				tevp = mevent_add(1, EVF_TIMER, timer_callback,
+				    NULL);
+			}
+
+			printf("incoming connection, spawning thread\n");
+			pthread_create(&tid, NULL, echoer,
+			    (void *)(uintptr_t)news);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Start the acceptor thread, then turn the main thread into the mevent
+ * i/o thread.
+ */
+int
+main(void)
+{
+	pthread_t acceptor_tid;
+
+	pthread_create(&acceptor_tid, NULL, acceptor, NULL);
+
+	mevent_dispatch();
+	return (0);
+}
diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c
new file mode 100644
index 0000000000..e78f88f074
--- /dev/null
+++ b/usr/src/cmd/bhyve/mptbl.c
@@ -0,0 +1,379 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <x86/mptable.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "acpi.h"
+#include "bhyverun.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+
+#define MPTABLE_BASE 0xE0000
+
+/* floating pointer length + maximum length of configuration table */
+#define MPTABLE_MAX_LENGTH (65536 + 16)
+
+#define LAPIC_PADDR 0xFEE00000
+#define LAPIC_VERSION 16
+
+#define IOAPIC_PADDR 0xFEC00000
+#define IOAPIC_VERSION 0x11
+
+#define MP_SPECREV 4
+#define MPFP_SIG "_MP_"
+
+/* Configuration header defines */
+#define MPCH_SIG "PCMP"
+#define MPCH_OEMID "BHyVe "
+#define MPCH_OEMID_LEN 8
+#define MPCH_PRODID "Hypervisor "
+#define MPCH_PRODID_LEN 12
+
+/* Processor entry defines */
+#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
+#define MPEP_SIG_MODEL 26
+#define MPEP_SIG_STEPPING 5
+#define MPEP_SIG \
+ ((MPEP_SIG_FAMILY << 8) | \
+ (MPEP_SIG_MODEL << 4) | \
+ (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
+
+/* Number of local intr entries */
+#define MPEII_NUM_LOCAL_IRQ 2
+
+/* Bus entry defines */
+#define MPE_NUM_BUSES 2
+#define MPE_BUSNAME_LEN 6
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+/*
+ * Return the byte which, added to the `len' bytes starting at `base',
+ * makes their sum equal zero modulo 256.
+ */
+static uint8_t
+mpt_compute_checksum(void *base, size_t len)
+{
+	const uint8_t *p = base;
+	uint8_t total = 0;
+	size_t i;
+
+	for (i = 0; i < len; i++)
+		total += p[i];
+
+	return (256 - total);
+}
+
+/*
+ * Fill in the MP floating pointer structure at `mpfp'.  `gpa' is the
+ * address the structure will have; the configuration table address
+ * stored in it is the byte right after the structure.
+ */
+static void
+mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
+{
+	const size_t fpsize = sizeof (*mpfp);
+
+	memset(mpfp, 0, fpsize);
+	mpfp->spec_rev = MP_SPECREV;
+	mpfp->length = 1;
+	mpfp->pap = gpa + fpsize;
+	memcpy(mpfp->signature, MPFP_SIG, 4);
+
+	/* Must come last: every field above contributes to the sum. */
+	mpfp->checksum = mpt_compute_checksum(mpfp, fpsize);
+}
+
+/*
+ * Initialize the MP configuration table header at `mpch' with the
+ * signature, spec revision, OEM and product identifiers and the local
+ * APIC address.  Entry count, table length and checksum stay zero here.
+ */
+static void
+mpt_build_mpch(mpcth_t mpch)
+{
+	memset(mpch, 0, sizeof (*mpch));
+
+	mpch->spec_rev = MP_SPECREV;
+	mpch->apic_address = LAPIC_PADDR;
+
+	memcpy(mpch->signature, MPCH_SIG, 4);
+	memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
+	memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
+}
+
+/*
+ * Write `ncpu' consecutive processor entries starting at `mpep'.  Each
+ * CPU's index is used as its APIC ID; every CPU is flagged enabled and
+ * CPU 0 is additionally flagged as the bootstrap processor.
+ */
+static void
+mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < ncpu; cpu++) {
+		proc_entry_ptr ent = &mpep[cpu];
+
+		memset(ent, 0, sizeof (*ent));
+		ent->type = MPCT_ENTRY_PROCESSOR;
+		ent->apic_id = cpu; // XXX
+		ent->apic_version = LAPIC_VERSION;
+		ent->cpu_flags = (cpu == 0) ?
+		    (PROCENTRY_FLAG_EN | PROCENTRY_FLAG_BP) :
+		    PROCENTRY_FLAG_EN;
+		ent->cpu_signature = MPEP_SIG;
+		ent->feature_flags = MPEP_FEATURES;
+	}
+}
+
+/*
+ * Write the MPEII_NUM_LOCAL_IRQ (two) local interrupt entries at `mpie':
+ * LINT0 as ExtINT and LINT1 as NMI.  Both use destination APIC ID 0xff
+ * and conforming polarity and trigger mode.
+ */
+static void
+mpt_build_localint_entries(int_entry_ptr mpie)
+{
+	int lint;
+
+	for (lint = 0; lint < MPEII_NUM_LOCAL_IRQ; lint++) {
+		int_entry_ptr ent = &mpie[lint];
+
+		memset(ent, 0, sizeof (*ent));
+		ent->type = MPCT_ENTRY_LOCAL_INT;
+		/* Hardcode LINT0 as ExtINT and LINT1 as NMI on all CPUs. */
+		ent->int_type = (lint == 0) ?
+		    INTENTRY_TYPE_EXTINT : INTENTRY_TYPE_NMI;
+		ent->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
+		    INTENTRY_FLAGS_TRIGGER_CONFORM;
+		ent->dst_apic_id = 0xff;
+		ent->dst_apic_int = lint;
+	}
+}
+
+/*
+ * Write the two bus entries at `mpeb': bus 0 is PCI, bus 1 is ISA.
+ */
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+	bus_entry_ptr pci = &mpeb[0];
+	bus_entry_ptr isa = &mpeb[1];
+
+	memset(pci, 0, sizeof (*pci));
+	pci->type = MPCT_ENTRY_BUS;
+	pci->bus_id = 0;
+	memcpy(pci->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+
+	memset(isa, 0, sizeof (*isa));
+	isa->type = MPCT_ENTRY_BUS;
+	isa->bus_id = 1;
+	memcpy(isa->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+}
+
+/*
+ * Write a single, enabled I/O APIC entry at `mpei' with APIC ID `id',
+ * located at IOAPIC_PADDR.
+ */
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+	memset(mpei, 0, sizeof (*mpei));
+
+	mpei->type = MPCT_ENTRY_IOAPIC;
+	mpei->apic_address = IOAPIC_PADDR;
+	mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+	mpei->apic_version = IOAPIC_VERSION;
+	mpei->apic_id = id;
+}
+
+/*
+ * Number of I/O interrupt entries mpt_build_ioint_entries() will emit:
+ * the fixed entries for the first 16 pins plus one per PCI INTx pin
+ * reported by pci_count_lintr() across all buses.
+ */
+static int
+mpt_count_ioint_entries(void)
+{
+	int total = 16;		/* the first 16 pins are always present */
+	int bus;
+
+	for (bus = 0; bus <= PCI_BUSMAX; bus++)
+		total += pci_count_lintr(bus);
+
+	return (total);
+}
+
+/*
+ * pci_walk_lintr() callback: emit one I/O interrupt entry for a PCI INTx
+ * pin.  `arg' points at the caller's write cursor, which is advanced by
+ * one entry.  `pirq_pin' is unused.
+ */
+static void
+mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
+{
+	int_entry_ptr *cursor = arg;
+	int_entry_ptr ent = *cursor;
+
+	memset(ent, 0, sizeof (*ent));
+	ent->type = MPCT_ENTRY_INT;
+	ent->int_type = INTENTRY_TYPE_INT;
+
+	/* Source: the device's slot and zero-based INTx pin on `bus'. */
+	ent->src_bus_id = bus;
+	ent->src_bus_irq = (slot << 2) | (pin - 1);
+
+	/*
+	 * This is always after another I/O interrupt entry, so cheat
+	 * and fetch the I/O APIC ID from the prior entry.
+	 */
+	ent->dst_apic_id = ent[-1].dst_apic_id;
+	ent->dst_apic_int = ioapic_irq;
+
+	*cursor = ent + 1;
+}
+
+/*
+ * Write the I/O interrupt entries starting at `mpie', all targeting the
+ * I/O APIC with ID `id': first the 16 fixed entries sourced from bus 1,
+ * then one entry per PCI INTx pin on every bus.
+ *
+ * The fixed part follows the default configuration used by the kernel's
+ * mptable_parse_default_config_ints(); tweak later if needed.
+ */
+static void
+mpt_build_ioint_entries(int_entry_ptr mpie, int id)
+{
+	int pin, bus;
+
+	for (pin = 0; pin < 16; pin++) {
+		memset(mpie, 0, sizeof (*mpie));
+		mpie->type = MPCT_ENTRY_INT;
+		mpie->src_bus_id = 1;
+		mpie->dst_apic_id = id;
+		mpie->dst_apic_int = pin;
+
+		if (pin == 0) {
+			/* Pin 0 is an ExtINT pin. */
+			mpie->int_type = INTENTRY_TYPE_EXTINT;
+		} else {
+			mpie->int_type = INTENTRY_TYPE_INT;
+			/*
+			 * IRQ 0 is routed to pin 2; every other pin,
+			 * the SCI included, is identity mapped.
+			 */
+			mpie->src_bus_irq = (pin == 2) ? 0 : pin;
+			if (pin == SCI_INT) {
+				/* ACPI SCI is level triggered, active-lo. */
+				mpie->int_flags =
+				    INTENTRY_FLAGS_POLARITY_ACTIVELO |
+				    INTENTRY_FLAGS_TRIGGER_LEVEL;
+			}
+		}
+		mpie++;
+	}
+
+	/* Next, generate entries for any PCI INTx interrupts. */
+	for (bus = 0; bus <= PCI_BUSMAX; bus++)
+		pci_walk_lintr(bus, mpt_generate_pci_int, &mpie);
+}
+
+/*
+ * Remember an OEM table of `tblsz' bytes at `tbl' for mptable_build() to
+ * append.  Only the pointer is stored, not a copy of the data.
+ */
+void
+mptable_add_oemtbl(void *tbl, int tblsz)
+{
+	oem_tbl_size = tblsz;
+	oem_tbl_start = tbl;
+}
+
+/*
+ * Build the MP table for `ncpu' CPUs at address MPTABLE_BASE.
+ *
+ * Pieces are laid out back to back: floating pointer, configuration
+ * header, processor entries, bus entries, I/O APIC entry, I/O interrupt
+ * entries, local interrupt entries and, if one was registered with
+ * mptable_add_oemtbl(), the OEM table.
+ *
+ * Returns 0 on success, ENOMEM when the target range cannot be mapped,
+ * or EINVAL when any PCI bus other than bus 0 is configured.
+ */
+int
+mptable_build(struct vmctx *ctx, int ncpu)
+{
+ mpcth_t mpch;
+ bus_entry_ptr mpeb;
+ io_apic_entry_ptr mpei;
+ proc_entry_ptr mpep;
+ mpfps_t mpfp;
+ int_entry_ptr mpie;
+ int ioints, bus;
+ char *curraddr;
+ char *startaddr;
+
+ startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH);
+ if (startaddr == NULL) {
+ fprintf(stderr, "mptable requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ /*
+ * There is no way to advertise multiple PCI hierarchies via MPtable
+ * so require that there is no PCI hierarchy with a non-zero bus
+ * number.
+ */
+ for (bus = 1; bus <= PCI_BUSMAX; bus++) {
+ if (pci_bus_configured(bus)) {
+ fprintf(stderr, "MPtable is incompatible with "
+ "multiple PCI hierarchies.\r\n");
+ fprintf(stderr, "MPtable generation can be disabled "
+ "by passing the -Y option to bhyve(8).\r\n");
+ return (EINVAL);
+ }
+ }
+
+ /* curraddr is the write cursor; each piece advances it by its size. */
+ curraddr = startaddr;
+ mpfp = (mpfps_t)curraddr;
+ mpt_build_mpfp(mpfp, MPTABLE_BASE);
+ curraddr += sizeof(*mpfp);
+
+ mpch = (mpcth_t)curraddr;
+ mpt_build_mpch(mpch);
+ curraddr += sizeof(*mpch);
+
+ mpep = (proc_entry_ptr)curraddr;
+ mpt_build_proc_entries(mpep, ncpu);
+ curraddr += sizeof(*mpep) * ncpu;
+ mpch->entry_count += ncpu;
+
+ mpeb = (bus_entry_ptr) curraddr;
+ mpt_build_bus_entries(mpeb);
+ curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
+ mpch->entry_count += MPE_NUM_BUSES;
+
+ /* The I/O APIC is given APIC ID 0, as are the entries routed to it. */
+ mpei = (io_apic_entry_ptr)curraddr;
+ mpt_build_ioapic_entries(mpei, 0);
+ curraddr += sizeof(*mpei);
+ mpch->entry_count++;
+
+ /* The count is computed separately from the builder; they must agree. */
+ mpie = (int_entry_ptr) curraddr;
+ ioints = mpt_count_ioint_entries();
+ mpt_build_ioint_entries(mpie, 0);
+ curraddr += sizeof(*mpie) * ioints;
+ mpch->entry_count += ioints;
+
+ mpie = (int_entry_ptr)curraddr;
+ mpt_build_localint_entries(mpie);
+ curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ;
+ mpch->entry_count += MPEII_NUM_LOCAL_IRQ;
+
+ /*
+ * The OEM table is copied right after the base table; curraddr is not
+ * advanced, so it is excluded from base_table_length below.
+ * NOTE(review): nothing here checks that the base table plus
+ * oem_tbl_size fits inside the MPTABLE_MAX_LENGTH bytes mapped above.
+ */
+ if (oem_tbl_start) {
+ mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
+ mpch->oem_table_size = oem_tbl_size;
+ memcpy(curraddr, oem_tbl_start, oem_tbl_size);
+ }
+
+ /* Length and checksum cover the header plus all entries after it. */
+ mpch->base_table_length = curraddr - (char *)mpch;
+ mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h
new file mode 100644
index 0000000000..ebc8d85ea8
--- /dev/null
+++ b/usr/src/cmd/bhyve/mptbl.h
@@ -0,0 +1,37 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MPTBL_H_
+#define _MPTBL_H_
+
+/*
+ * Build the MP table for a guest with 'ncpu' virtual CPUs.
+ * NOTE(review): the definition is in mptbl.c; it presumably returns 0 on
+ * success and an errno-style value on failure -- confirm against that file.
+ */
+int mptable_build(struct vmctx *ctx, int ncpu);
+/*
+ * Register an OEM table of 'tblsz' bytes located at 'tbl'.
+ * NOTE(review): presumably consumed by mptable_build() -- confirm in mptbl.c.
+ */
+void mptable_add_oemtbl(void *tbl, int tblsz);
+
+#endif /* _MPTBL_H_ */
diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c
new file mode 100644
index 0000000000..1e3feffcc2
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_ahci.c
@@ -0,0 +1,2485 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org>
+ * Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+#include <sys/ata.h>
+#include <sys/endian.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <inttypes.h>
+#include <md5.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "ahci.h"
+#include "block_if.h"
+
+/* Port-count limits; MAX_PORTS also sizes the softc's port[] array. */
+#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */
+#define MAX_PORTS 32 /* AHCI supports 32 ports */
+
+/* Device signatures stored in a port's 'sig' field by ahci_port_reset(). */
+#define PxSIG_ATA 0x00000101 /* ATA drive */
+#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */
+
+/*
+ * FIS type codes.  The code is written into byte 0 of every FIS built in
+ * this file and is also used by ahci_write_fis() to pick the slot of the
+ * received-FIS area that the FIS is copied into.
+ */
+enum sata_fis_type {
+ FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */
+ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */
+ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */
+ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */
+ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */
+ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */
+ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */
+ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */
+};
+
+/*
+ * SCSI opcodes
+ */
+#define TEST_UNIT_READY 0x00
+#define REQUEST_SENSE 0x03
+#define INQUIRY 0x12
+#define START_STOP_UNIT 0x1B
+#define PREVENT_ALLOW 0x1E
+#define READ_CAPACITY 0x25
+#define READ_10 0x28
+#define POSITION_TO_ELEMENT 0x2B
+#define READ_TOC 0x43
+#define GET_EVENT_STATUS_NOTIFICATION 0x4A
+#define MODE_SENSE_10 0x5A
+#define REPORT_LUNS 0xA0
+#define READ_12 0xA8
+#define READ_CD 0xBE
+
+/*
+ * SCSI mode page codes
+ */
+#define MODEPAGE_RW_ERROR_RECOVERY 0x01
+#define MODEPAGE_CD_CAPABILITIES 0x2A
+
+/*
+ * ATA commands
+ *
+ * NOTE(review): judging by their names these look like SET FEATURES
+ * subcommand / SATA feature values rather than ATA command opcodes --
+ * confirm against their users later in this file.
+ */
+#define ATA_SF_ENAB_SATA_SF 0x10
+#define ATA_SATA_SF_AN 0x05
+#define ATA_SF_DIS_SATA_SF 0x90
+
+/*
+ * Debug printf
+ *
+ * DPRINTF() writes to the 'dbg' stream and flushes it after every message
+ * when AHCI_DEBUG is defined; otherwise it expands to nothing, so its
+ * arguments are not evaluated.  WPRINTF() is always compiled in and prints
+ * to stdout.
+ */
+#ifdef AHCI_DEBUG
+static FILE *dbg;
+#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0)
+#else
+#define DPRINTF(format, arg...)
+#endif
+#define WPRINTF(format, arg...) printf(format, ##arg)
+
+/*
+ * Size of the per-port identification string buffer: 20 characters (the
+ * length copied into the IDENTIFY data by ata_string()) plus one extra byte,
+ * presumably for a terminating NUL.  The expansion is parenthesized so the
+ * macro keeps its value when used inside a larger expression.
+ */
+#define AHCI_PORT_IDENT (20 + 1)
+
+/*
+ * Per-command I/O request.  Requests are taken from a port's free list,
+ * filled in by the ahci_handle_*() routines and placed on the port's busy
+ * list while the blockif operation is outstanding.
+ */
+struct ahci_ioreq {
+ struct blockif_req io_req; /* request handed to the blockif layer */
+ struct ahci_port *io_pr;
+ STAILQ_ENTRY(ahci_ioreq) io_flist; /* linkage on the port free list */
+ TAILQ_ENTRY(ahci_ioreq) io_blist; /* linkage on the port busy list */
+ uint8_t *cfis; /* command FIS this request belongs to */
+ uint32_t len; /* total length of the command in bytes */
+ uint32_t done; /* bytes already submitted for this command */
+ int slot; /* command slot number */
+ int more; /* non-zero when part of the command is still to be issued */
+};
+
+/*
+ * State of a single emulated port.
+ * NOTE(review): the uint32_t group from 'clb' to 'fbs' appears to mirror the
+ * AHCI per-port register block (PxCLB .. PxFBS) -- confirm against the
+ * register access code later in this file.
+ */
+struct ahci_port {
+ struct blockif_ctxt *bctx; /* backing store; NULL means no device attached */
+ struct pci_ahci_softc *pr_sc; /* owning controller */
+ uint8_t *cmd_lst; /* host pointer to the command list */
+ uint8_t *rfis; /* host pointer to the received-FIS area */
+ char ident[AHCI_PORT_IDENT]; /* identification string reported by IDENTIFY */
+ int port; /* index of this port in the controller */
+ int atapi; /* non-zero for an ATAPI device */
+ int reset;
+ int waitforclear; /* set when a FIS with ATA_S_ERROR has been posted */
+ int mult_sectors;
+ uint8_t xfermode;
+ uint8_t err_cfis[20]; /* details of the last failed command */
+ uint8_t sense_key;
+ uint8_t asc;
+ u_int ccs;
+ uint32_t pending; /* bitmask of command slots currently in flight */
+
+ uint32_t clb;
+ uint32_t clbu;
+ uint32_t fb;
+ uint32_t fbu;
+ uint32_t is;
+ uint32_t ie;
+ uint32_t cmd;
+ uint32_t unused0;
+ uint32_t tfd;
+ uint32_t sig;
+ uint32_t ssts;
+ uint32_t sctl;
+ uint32_t serr;
+ uint32_t sact;
+ uint32_t ci;
+ uint32_t sntf;
+ uint32_t fbs;
+
+ /*
+ * i/o request info
+ */
+ struct ahci_ioreq *ioreq;
+ int ioqsz;
+ STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; /* free requests */
+ TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; /* busy (outstanding) requests */
+};
+
+/*
+ * One entry of a port's command list; entries are AHCI_CL_SIZE bytes apart
+ * and indexed by command slot.
+ */
+struct ahci_cmd_hdr {
+ uint16_t flags;
+ uint16_t prdtl; /* number of PRDT entries that follow the command FIS */
+ uint32_t prdbc; /* byte count actually transferred, set by write_prdt() */
+ uint64_t ctba;
+ uint32_t reserved[4];
+};
+
+/*
+ * Physical region descriptor: one guest-physical data buffer.  The table of
+ * these entries starts 0x80 bytes after the command FIS.
+ */
+struct ahci_prdt_entry {
+ uint64_t dba; /* guest-physical address of the buffer */
+ uint32_t reserved;
+#define DBCMASK 0x3fffff
+ uint32_t dbc; /* (dbc & DBCMASK) + 1 is the buffer length in bytes */
+};
+
+/*
+ * Controller-wide state.
+ * NOTE(review): the uint32_t fields from 'cap' to 'bohc' appear to mirror
+ * the AHCI generic host control registers -- confirm against the register
+ * access code later in this file.
+ */
+struct pci_ahci_softc {
+ struct pci_devinst *asc_pi; /* PCI device instance of this controller */
+ pthread_mutex_t mtx; /* must be held when calling ahci_port_stop() */
+ int ports; /* number of ports in use in port[] */
+ uint32_t cap;
+ uint32_t ghc;
+ uint32_t is; /* one bit per port with an enabled interrupt pending */
+ uint32_t pi;
+ uint32_t vs;
+ uint32_t ccc_ctl;
+ uint32_t ccc_pts;
+ uint32_t em_loc;
+ uint32_t em_ctl;
+ uint32_t cap2;
+ uint32_t bohc;
+ uint32_t lintr; /* non-zero while the legacy interrupt is asserted */
+ struct ahci_port port[MAX_PORTS];
+};
+/* VM context of the guest that owns controller 'sc'. */
+#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx)
+
+/* Forward declaration: called by ahci_handle_dsm_trim() before its definition. */
+static void ahci_handle_port(struct ahci_port *p);
+
+/*
+ * Convert a logical block address to minute/second/frame form.
+ *
+ * The address is first biased by 150 frames; with 75 frames per second and
+ * 60 seconds per minute the result is stored as buf[0] = minutes,
+ * buf[1] = seconds and buf[2] = frames.
+ */
+static inline void lba_to_msf(uint8_t *buf, int lba)
+{
+	int frames = lba + 150;
+	int seconds = frames / 75;
+
+	buf[0] = seconds / 60;
+	buf[1] = seconds % 60;
+	buf[2] = frames % 75;
+}
+
+/*
+ * Generate HBA interrupts on global IS register write.
+ *
+ * 'mask' is the set of port bits touched by the write.  The global IS is
+ * first refreshed from every port's PxIS/PxIE.  If nothing is pending or
+ * interrupts are globally disabled, the legacy interrupt is deasserted.
+ * Otherwise the legacy interrupt is asserted when no MSI messages are
+ * available, or the MSIs matching the touched ports are generated.
+ *
+ * Port bit masks are built with unsigned shifts: with 32 ports the shift
+ * count can reach 31, which is undefined for a signed int.
+ */
+static void
+ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask)
+{
+	struct pci_devinst *pi = sc->asc_pi;
+	struct ahci_port *p;
+	int i, nmsg;
+	uint32_t mmask;
+
+	/* Update global IS from PxIS/PxIE. */
+	for (i = 0; i < sc->ports; i++) {
+		p = &sc->port[i];
+		if (p->is & p->ie)
+			sc->is |= (1U << i);
+	}
+	DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is);
+
+	/* If there is nothing enabled -- clear legacy interrupt and exit. */
+	if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) {
+		if (sc->lintr) {
+			pci_lintr_deassert(pi);
+			sc->lintr = 0;
+		}
+		return;
+	}
+
+	/* If there is anything and no MSI -- assert legacy interrupt. */
+	nmsg = pci_msi_maxmsgnum(pi);
+	if (nmsg == 0) {
+		if (!sc->lintr) {
+			sc->lintr = 1;
+			pci_lintr_assert(pi);
+		}
+		return;
+	}
+
+	/*
+	 * Assert respective MSIs for ports that were touched.  When there
+	 * are fewer messages than ports, the last message is shared by all
+	 * remaining ports.
+	 */
+	for (i = 0; i < nmsg; i++) {
+		if (sc->ports <= nmsg || i < nmsg - 1)
+			mmask = 1U << i;
+		else
+			mmask = 0xffffffff << i;
+		if (sc->is & mask && mmask & mask)
+			pci_generate_msi(pi, i);
+	}
+}
+
+/*
+ * Generate HBA interrupt on specific port event.
+ *
+ * Nothing happens unless the port has an enabled interrupt pending
+ * (PxIS & PxIE).  A port with its own MSI message always raises it (when
+ * interrupts are globally enabled).  A port sharing the last MSI message or
+ * the legacy interrupt raises it only on the 0 -> 1 transition of its bit
+ * in the global IS.
+ *
+ * The port bit is built with an unsigned shift because p->port can be 31,
+ * for which a signed "1 << 31" is undefined.
+ */
+static void
+ahci_port_intr(struct ahci_port *p)
+{
+	struct pci_ahci_softc *sc = p->pr_sc;
+	struct pci_devinst *pi = sc->asc_pi;
+	uint32_t portbit = 1U << p->port;
+	int nmsg;
+
+	DPRINTF("%s(%d) %08x/%08x %08x\n", __func__,
+	    p->port, p->is, p->ie, sc->is);
+
+	/* If there is nothing enabled -- we are done. */
+	if ((p->is & p->ie) == 0)
+		return;
+
+	/* In case of non-shared MSI always generate interrupt. */
+	nmsg = pci_msi_maxmsgnum(pi);
+	if (sc->ports <= nmsg || p->port < nmsg - 1) {
+		sc->is |= portbit;
+		if ((sc->ghc & AHCI_GHC_IE) == 0)
+			return;
+		pci_generate_msi(pi, p->port);
+		return;
+	}
+
+	/* If IS for this port is already set -- do nothing. */
+	if (sc->is & portbit)
+		return;
+
+	sc->is |= portbit;
+
+	/* If interrupts are enabled -- generate one. */
+	if ((sc->ghc & AHCI_GHC_IE) == 0)
+		return;
+	if (nmsg > 0) {
+		pci_generate_msi(pi, nmsg - 1);
+	} else if (!sc->lintr) {
+		sc->lintr = 1;
+		pci_lintr_assert(pi);
+	}
+}
+
+/*
+ * Copy a device-to-host FIS into the port's received-FIS area and raise the
+ * matching port interrupt.
+ *
+ * Nothing is written unless the received-FIS area is mapped and FIS receive
+ * is enabled in the port command register.  Bit 6 of fis[1] requests an
+ * interrupt; a FIS whose status byte has ATA_S_ERROR set additionally
+ * raises the task-file-error interrupt and sets 'waitforclear'.
+ * Unsupported FIS types are reported and dropped.
+ */
+static void
+ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
+{
+	int offset, len, irq;
+	int want_intr;
+
+	if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE))
+		return;
+
+	want_intr = (fis[1] & (1 << 6)) != 0;
+
+	switch (ft) {
+	case FIS_TYPE_PIOSETUP:
+		offset = 0x20;
+		len = 20;
+		irq = want_intr ? AHCI_P_IX_PS : 0;
+		break;
+	case FIS_TYPE_REGD2H:
+		offset = 0x40;
+		len = 20;
+		irq = want_intr ? AHCI_P_IX_DHR : 0;
+		break;
+	case FIS_TYPE_SETDEVBITS:
+		offset = 0x58;
+		len = 8;
+		irq = want_intr ? AHCI_P_IX_SDB : 0;
+		break;
+	default:
+		WPRINTF("unsupported fis type %d\n", ft);
+		return;
+	}
+
+	if (fis[2] & ATA_S_ERROR) {
+		p->waitforclear = 1;
+		irq |= AHCI_P_IX_TFE;
+	}
+	memcpy(p->rfis + offset, fis, len);
+
+	/* Only signal when at least one requested bit is newly set. */
+	if (irq != 0 && (~p->is & irq) != 0) {
+		p->is |= irq;
+		ahci_port_intr(p);
+	}
+}
+
+/*
+ * Post an otherwise empty PIO setup FIS (only the type byte is filled in).
+ */
+static void
+ahci_write_fis_piosetup(struct ahci_port *p)
+{
+	uint8_t fis[20] = { 0 };
+
+	fis[0] = FIS_TYPE_PIOSETUP;
+	ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis);
+}
+
+/*
+ * Post a Set Device Bits FIS for the command in 'slot'.
+ *
+ * 'tfd' carries the status in its low byte and the error code in bits 8-15.
+ * On error the failing command is recorded in p->err_cfis; on success the
+ * slot's bit is stored in bytes 4-7 of the FIS and cleared from PxSACT.
+ * Only the status bits selected by 0x77 are propagated to the FIS and to
+ * p->tfd.
+ *
+ * The slot mask is written with memcpy() in host byte order instead of
+ * through a cast uint32_t pointer, which would break the aliasing and
+ * alignment rules for the byte array.  It is built with an unsigned shift
+ * because slot can be 31.
+ */
+static void
+ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
+{
+	uint8_t fis[8];
+	uint8_t error;
+
+	error = (tfd >> 8) & 0xff;
+	tfd &= 0x77;
+	memset(fis, 0, sizeof(fis));
+	fis[0] = FIS_TYPE_SETDEVBITS;
+	fis[1] = (1 << 6);
+	fis[2] = tfd;
+	fis[3] = error;
+	if (fis[2] & ATA_S_ERROR) {
+		p->err_cfis[0] = slot;
+		p->err_cfis[2] = tfd;
+		p->err_cfis[3] = error;
+		memcpy(&p->err_cfis[4], cfis + 4, 16);
+	} else {
+		uint32_t slotbit = 1U << slot;
+
+		memcpy(fis + 4, &slotbit, sizeof(slotbit));
+		p->sact &= ~slotbit;
+	}
+	p->tfd &= ~0x77;
+	p->tfd |= tfd;
+	ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis);
+}
+
+/*
+ * Post a Register D2H FIS completing the command in 'slot'.
+ *
+ * 'tfd' carries the status in its low byte and the error code in bits 8-15.
+ * Bytes 4-13 of the command FIS are echoed back in the response.  On error
+ * the failing command is recorded in p->err_cfis and the slot stays set in
+ * PxCI; on success the slot's PxCI bit is cleared.
+ */
+static void
+ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
+{
+	uint8_t fis[20];
+	uint8_t status, error;
+
+	status = tfd & 0xff;
+	error = (tfd >> 8) & 0xff;
+
+	memset(fis, 0, sizeof(fis));
+	fis[0] = FIS_TYPE_REGD2H;
+	fis[1] = (1 << 6);
+	fis[2] = status;
+	fis[3] = error;
+	/* Echo bytes 4..13 of the command FIS. */
+	memcpy(&fis[4], &cfis[4], 10);
+
+	if (status & ATA_S_ERROR) {
+		p->err_cfis[0] = 0x80;
+		p->err_cfis[2] = status;
+		p->err_cfis[3] = error;
+		memcpy(&p->err_cfis[4], cfis + 4, 16);
+	} else {
+		p->ci &= ~(1 << slot);
+	}
+	p->tfd = tfd;
+	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
+}
+
+/*
+ * Acknowledge a queued command: set the task file to ready, clear the
+ * slot's PxCI bit and post a Register D2H FIS that does not request an
+ * interrupt.
+ */
+static void
+ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot)
+{
+	uint8_t fis[20] = { 0 };
+
+	p->tfd = ATA_S_READY | ATA_S_DSC;
+	p->ci &= ~(1 << slot);
+
+	fis[0] = FIS_TYPE_REGD2H;
+	fis[1] = 0;			/* No interrupt */
+	fis[2] = p->tfd;		/* Status */
+	fis[3] = 0;			/* No error */
+	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
+}
+
+/*
+ * Post the Register D2H FIS sent after a port reset.  Bytes 3, 4 and 12 are
+ * set to 1; for an ATAPI device bytes 5 and 6 additionally carry 0x14/0xeb.
+ */
+static void
+ahci_write_reset_fis_d2h(struct ahci_port *p)
+{
+	uint8_t fis[20] = { 0 };
+
+	fis[0] = FIS_TYPE_REGD2H;
+	fis[3] = 1;
+	fis[4] = 1;
+	fis[12] = 1;
+	if (p->atapi) {
+		fis[5] = 0x14;
+		fis[6] = 0xeb;
+	}
+	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
+}
+
+/*
+ * Finish stopping a port.
+ *
+ * Once command-list processing is off (PxCMD.ST clear) and no command is
+ * in flight, clear the running bit, the current command slot, the command
+ * issue and active bits, and the wait-for-clear flag.
+ */
+static void
+ahci_check_stopped(struct ahci_port *p)
+{
+	/* Still processing the command list. */
+	if (p->cmd & AHCI_P_CMD_ST)
+		return;
+
+	/* Something is still in flight. */
+	if (p->pending != 0)
+		return;
+
+	p->ccs = 0;
+	p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
+	p->ci = 0;
+	p->sact = 0;
+	p->waitforclear = 0;
+}
+
+/*
+ * Stop a port: try to cancel every outstanding blockif request and release
+ * the ones that were cancelled.
+ *
+ * The caller must hold the controller mutex.  Requests that cannot be
+ * cancelled stay on the busy list.  Each cancelled request has its slot
+ * cleared from PxSACT (queued commands) or PxCI (everything else) and from
+ * the pending mask, and is moved back to the free list.
+ *
+ * The busy list is walked with TAILQ_FOREACH_SAFE because entries are
+ * removed from it inside the loop.  Slot masks use unsigned shifts because
+ * slot can be 31.
+ */
+static void
+ahci_port_stop(struct ahci_port *p)
+{
+	struct ahci_ioreq *aior, *aior_next;
+	uint8_t *cfis;
+	int slot;
+	int error;
+
+	assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
+
+	TAILQ_FOREACH_SAFE(aior, &p->iobhd, io_blist, aior_next) {
+		/*
+		 * Try to cancel the outstanding blockif request.
+		 */
+		error = blockif_cancel(p->bctx, &aior->io_req);
+		if (error != 0)
+			continue;
+
+		slot = aior->slot;
+		cfis = aior->cfis;
+		if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+		    cfis[2] == ATA_READ_FPDMA_QUEUED ||
+		    cfis[2] == ATA_SEND_FPDMA_QUEUED)
+			p->sact &= ~(1U << slot);	/* NCQ */
+		else
+			p->ci &= ~(1U << slot);
+
+		/*
+		 * This command is now done.
+		 */
+		p->pending &= ~(1U << slot);
+
+		/*
+		 * Delete the blockif request from the busy list
+		 */
+		TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+		/*
+		 * Move the blockif request back to the free list
+		 */
+		STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+	}
+
+	ahci_check_stopped(p);
+}
+
+/*
+ * Reset a single port.
+ *
+ * Error and active registers are cleared and the transfer defaults are
+ * restored.  A port without a backing store reports "no device".
+ * Otherwise the link is reported online and active at the speed requested
+ * in PxSCTL (GEN3 when none is requested), the signature and task file are
+ * set according to the device type, and the reset FIS is posted.
+ */
+static void
+ahci_port_reset(struct ahci_port *pr)
+{
+	uint32_t speed;
+
+	pr->serr = 0;
+	pr->sact = 0;
+	pr->xfermode = ATA_UDMA6;
+	pr->mult_sectors = 128;
+
+	if (pr->bctx == NULL) {
+		pr->ssts = ATA_SS_DET_NO_DEVICE;
+		pr->sig = 0xFFFFFFFF;
+		pr->tfd = 0x7F;
+		return;
+	}
+
+	pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE;
+	speed = pr->sctl & ATA_SC_SPD_MASK;
+	if (speed == 0)
+		pr->ssts |= ATA_SS_SPD_GEN3;
+	else
+		pr->ssts |= speed;
+
+	pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA;
+	if (pr->atapi) {
+		pr->sig = PxSIG_ATAPI;
+	} else {
+		pr->sig = PxSIG_ATA;
+		pr->tfd |= ATA_S_READY;
+	}
+	ahci_write_reset_fis_d2h(pr);
+}
+
+/*
+ * Reset the whole controller: restore the global host control register,
+ * clear the global interrupt status, drop a pending legacy interrupt and
+ * reset every port in use.
+ */
+static void
+ahci_reset(struct pci_ahci_softc *sc)
+{
+	struct ahci_port *port;
+	int n;
+
+	sc->ghc = AHCI_GHC_AE;
+	sc->is = 0;
+
+	if (sc->lintr) {
+		pci_lintr_deassert(sc->asc_pi);
+		sc->lintr = 0;
+	}
+
+	for (n = 0; n < sc->ports; n++) {
+		port = &sc->port[n];
+
+		port->ie = 0;
+		port->is = 0;
+		port->cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD);
+		/* Only ports with a backing store get the CPS bit. */
+		if (port->bctx)
+			port->cmd |= AHCI_P_CMD_CPS;
+		port->sctl = 0;
+		ahci_port_reset(port);
+	}
+}
+
+/*
+ * Copy 'src' into a 'len'-byte ATA string field at 'dest'.
+ *
+ * The two bytes of every 16-bit word are swapped (byte i is stored at index
+ * i ^ 1) and the field is padded with spaces once 'src' is exhausted.  No
+ * terminating NUL is written.
+ */
+static void
+ata_string(uint8_t *dest, const char *src, int len)
+{
+	int pos;
+
+	for (pos = 0; pos < len; pos++) {
+		uint8_t ch = ' ';
+
+		if (*src != '\0')
+			ch = *src++;
+		dest[pos ^ 1] = ch;
+	}
+}
+
+/*
+ * Copy 'src' into a 'len'-byte field at 'dest' in natural byte order,
+ * padding with spaces once 'src' is exhausted.  No terminating NUL is
+ * written.
+ */
+static void
+atapi_string(uint8_t *dest, const char *src, int len)
+{
+	uint8_t *out = dest;
+	uint8_t *end = dest + len;
+
+	while (out < end) {
+		if (*src != '\0')
+			*out = *src++;
+		else
+			*out = ' ';
+		out++;
+	}
+}
+
+/*
+ * Build up the iovec based on the PRDT, 'done' and 'len'.
+ *
+ * p     - port that owns the request
+ * aior  - request being built; aior->done and aior->len delimit the part of
+ *         the transfer still to be issued
+ * prdt  - first PRDT entry of the command
+ * prdtl - number of PRDT entries
+ *
+ * On return the blockif request holds the iovec and its byte count in
+ * br_resid, aior->done has advanced by that count, and aior->more says
+ * whether a further request is needed to finish the command.
+ */
+static void
+ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior,
+ struct ahci_prdt_entry *prdt, uint16_t prdtl)
+{
+ struct blockif_req *breq = &aior->io_req;
+ int i, j, skip, todo, left, extra;
+ uint32_t dbcsz;
+
+ /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */
+ skip = aior->done;
+ left = aior->len - aior->done;
+ todo = 0;
+ for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0;
+ i++, prdt++) {
+ /* The dbc field stores the byte count minus one. */
+ dbcsz = (prdt->dbc & DBCMASK) + 1;
+ /* Skip already done part of the PRDT */
+ if (dbcsz <= skip) {
+ skip -= dbcsz;
+ continue;
+ }
+ /* Partially consumed entry: start 'skip' bytes into it. */
+ dbcsz -= skip;
+ if (dbcsz > left)
+ dbcsz = left;
+ breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc),
+ prdt->dba + skip, dbcsz);
+ breq->br_iov[j].iov_len = dbcsz;
+ todo += dbcsz;
+ left -= dbcsz;
+ skip = 0;
+ j++;
+ }
+
+ /* If we got limited by IOV length, round I/O down to sector size. */
+ if (j == BLOCKIF_IOV_MAX) {
+ extra = todo % blockif_sectsz(p->bctx);
+ todo -= extra;
+ assert(todo > 0);
+ /* Trim the excess bytes off the tail of the iovec. */
+ while (extra > 0) {
+ if (breq->br_iov[j - 1].iov_len > extra) {
+ breq->br_iov[j - 1].iov_len -= extra;
+ break;
+ }
+ extra -= breq->br_iov[j - 1].iov_len;
+ j--;
+ }
+ }
+
+ breq->br_iovcnt = j;
+ breq->br_resid = todo;
+ aior->done += todo;
+ /* More to do only if bytes remain and PRDT entries are left over. */
+ aior->more = (aior->done < aior->len && i < prdtl);
+}
+
+/*
+ * Start (or continue) a read or write command.
+ *
+ * p    - port the command was issued on
+ * slot - command slot
+ * cfis - command FIS; cfis[2] is the ATA command
+ * done - bytes of the command already transferred (0 on the first call)
+ *
+ * The LBA and sector count are decoded according to the command class
+ * (queued, 48-bit or 28-bit), converted to bytes, and a blockif request
+ * covering the next part of the transfer is submitted.
+ */
+static void
+ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
+{
+	struct ahci_ioreq *aior;
+	struct blockif_req *breq;
+	struct ahci_prdt_entry *prdt;
+	struct ahci_cmd_hdr *hdr;
+	uint64_t lba;
+	uint32_t len;
+	int err, first, ncq, readop, ext48;
+	uint8_t cmd = cfis[2];
+
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	first = (done == 0);
+
+	/* Everything that is not one of the write commands is a read. */
+	readop = !(cmd == ATA_WRITE || cmd == ATA_WRITE48 ||
+	    cmd == ATA_WRITE_MUL || cmd == ATA_WRITE_MUL48 ||
+	    cmd == ATA_WRITE_DMA || cmd == ATA_WRITE_DMA48 ||
+	    cmd == ATA_WRITE_FPDMA_QUEUED);
+
+	ncq = (cmd == ATA_WRITE_FPDMA_QUEUED ||
+	    cmd == ATA_READ_FPDMA_QUEUED);
+
+	ext48 = (cmd == ATA_READ48 || cmd == ATA_WRITE48 ||
+	    cmd == ATA_READ_MUL48 || cmd == ATA_WRITE_MUL48 ||
+	    cmd == ATA_READ_DMA48 || cmd == ATA_WRITE_DMA48);
+
+	if (ncq || ext48) {
+		/* 48-bit LBA; the count lives in different bytes for NCQ. */
+		lba = ((uint64_t)cfis[10] << 40) |
+		    ((uint64_t)cfis[9] << 32) |
+		    ((uint64_t)cfis[8] << 24) |
+		    ((uint64_t)cfis[6] << 16) |
+		    ((uint64_t)cfis[5] << 8) |
+		    cfis[4];
+		if (ncq)
+			len = cfis[11] << 8 | cfis[3];
+		else
+			len = cfis[13] << 8 | cfis[12];
+		if (len == 0)
+			len = 65536;
+	} else {
+		/* 28-bit LBA with an 8-bit sector count. */
+		lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) |
+		    (cfis[5] << 8) | cfis[4];
+		len = cfis[12];
+		if (len == 0)
+			len = 256;
+	}
+
+	/* Convert sectors to bytes. */
+	lba *= blockif_sectsz(p->bctx);
+	len *= blockif_sectsz(p->bctx);
+
+	/* Pull request off free list */
+	aior = STAILQ_FIRST(&p->iofhd);
+	assert(aior != NULL);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+
+	aior->cfis = cfis;
+	aior->slot = slot;
+	aior->len = len;
+	aior->done = done;
+	breq = &aior->io_req;
+	breq->br_offset = lba + done;
+	ahci_build_iov(p, aior, prdt, hdr->prdtl);
+
+	/* Mark this command in-flight. */
+	p->pending |= 1 << slot;
+
+	/* Stuff request onto busy list. */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	/* A queued command is acknowledged once, before its first I/O. */
+	if (ncq && first)
+		ahci_write_fis_d2h_ncq(p, slot);
+
+	if (readop)
+		err = blockif_read(p->bctx, breq);
+	else
+		err = blockif_write(p->bctx, breq);
+	assert(err == 0);
+}
+
+/*
+ * Submit a cache flush for the command in 'slot'.  The request carries no
+ * data (len, done and more are all zero); it is tracked on the busy list
+ * and in the pending mask like any other command.
+ */
+static void
+ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	struct ahci_ioreq *aior;
+	int err;
+
+	/* Pull request off free list. */
+	aior = STAILQ_FIRST(&p->iofhd);
+	assert(aior != NULL);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+
+	aior->cfis = cfis;
+	aior->slot = slot;
+	aior->len = 0;
+	aior->done = 0;
+	aior->more = 0;
+
+	/* Mark this command in-flight. */
+	p->pending |= 1 << slot;
+
+	/* Stuff request onto busy list. */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	err = blockif_flush(p->bctx, &aior->io_req);
+	assert(err == 0);
+}
+
+/*
+ * Copy up to 'size' bytes from the guest buffers described by the
+ * command's PRDT into 'buf'.
+ *
+ * p    - port the command was issued on
+ * slot - command slot, used to locate the command header
+ * cfis - command FIS; the PRDT starts 0x80 bytes after it
+ * buf  - destination buffer of at least 'size' bytes
+ * size - maximum number of bytes to copy
+ *
+ * Copying stops when 'size' bytes have been copied or the PRDT is
+ * exhausted, whichever comes first.  The destination cursor is a byte
+ * pointer so that the pointer arithmetic is valid ISO C (arithmetic on
+ * void * is a compiler extension).
+ */
+static inline void
+read_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
+    void *buf, int size)
+{
+	struct ahci_cmd_hdr *hdr;
+	struct ahci_prdt_entry *prdt;
+	uint8_t *to;
+	int i, len;
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	len = size;
+	to = buf;
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+	for (i = 0; i < hdr->prdtl && len; i++) {
+		uint8_t *ptr;
+		uint32_t dbcsz;
+		int sublen;
+
+		/* The dbc field stores the byte count minus one. */
+		dbcsz = (prdt->dbc & DBCMASK) + 1;
+		ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
+		sublen = MIN(len, dbcsz);
+		memcpy(to, ptr, sublen);
+		len -= sublen;
+		to += sublen;
+		prdt++;
+	}
+}
+
+/*
+ * Start (or continue) a DATA SET MANAGEMENT / SEND FPDMA QUEUED trim.
+ *
+ * p    - port the command was issued on
+ * slot - command slot
+ * cfis - command FIS; cfis[2] selects the non-queued or queued form
+ * done - bytes of the range list already processed (0 on the first call)
+ *
+ * The range list is read from the guest into a 512-byte buffer and scanned
+ * in 8-byte entries (48-bit LBA followed by a 16-bit sector count).
+ * Entries with a zero count are skipped; the first non-empty entry is
+ * submitted as a blockif delete.  When the end of the list is reached
+ * without finding one, the command is completed.
+ *
+ * Only one 512-byte block of ranges is ever read, so the guest-supplied
+ * list length is clamped to the buffer size and the continuation flag is
+ * computed as "done < len".  Without this a count other than one block
+ * would let 'done' index past the end of the local buffer.
+ */
+static void
+ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
+{
+	struct ahci_ioreq *aior;
+	struct blockif_req *breq;
+	uint8_t *entry;
+	uint64_t elba;
+	uint32_t len, elen;
+	int err, first, ncq;
+	uint8_t buf[512];
+
+	first = (done == 0);
+	if (cfis[2] == ATA_DATA_SET_MANAGEMENT) {
+		len = (uint16_t)cfis[13] << 8 | cfis[12];
+		len *= 512;
+		ncq = 0;
+	} else { /* ATA_SEND_FPDMA_QUEUED */
+		len = (uint16_t)cfis[11] << 8 | cfis[3];
+		len *= 512;
+		ncq = 1;
+	}
+	/* Never scan beyond the single block that is actually read. */
+	if (len > sizeof(buf))
+		len = sizeof(buf);
+	read_prdt(p, slot, cfis, buf, sizeof(buf));
+
+next:
+	entry = &buf[done];
+	elba = ((uint64_t)entry[5] << 40) |
+	    ((uint64_t)entry[4] << 32) |
+	    ((uint64_t)entry[3] << 24) |
+	    ((uint64_t)entry[2] << 16) |
+	    ((uint64_t)entry[1] << 8) |
+	    entry[0];
+	elen = (uint16_t)entry[7] << 8 | entry[6];
+	done += 8;
+	if (elen == 0) {
+		if (done >= len) {
+			/* End of the list: complete the command. */
+			if (ncq) {
+				if (first)
+					ahci_write_fis_d2h_ncq(p, slot);
+				ahci_write_fis_sdb(p, slot, cfis,
+				    ATA_S_READY | ATA_S_DSC);
+			} else {
+				ahci_write_fis_d2h(p, slot, cfis,
+				    ATA_S_READY | ATA_S_DSC);
+			}
+			p->pending &= ~(1 << slot);
+			ahci_check_stopped(p);
+			if (!first)
+				ahci_handle_port(p);
+			return;
+		}
+		goto next;
+	}
+
+	/*
+	 * Pull request off free list
+	 */
+	aior = STAILQ_FIRST(&p->iofhd);
+	assert(aior != NULL);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+	aior->cfis = cfis;
+	aior->slot = slot;
+	aior->len = len;
+	aior->done = done;
+	/* Continue only while entries remain inside the list. */
+	aior->more = (done < len);
+
+	breq = &aior->io_req;
+	breq->br_offset = elba * blockif_sectsz(p->bctx);
+	breq->br_resid = elen * blockif_sectsz(p->bctx);
+
+	/*
+	 * Mark this command in-flight.
+	 */
+	p->pending |= 1 << slot;
+
+	/*
+	 * Stuff request onto busy list
+	 */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	if (ncq && first)
+		ahci_write_fis_d2h_ncq(p, slot);
+
+	err = blockif_delete(p->bctx, breq);
+	assert(err == 0);
+}
+
+/*
+ * Copy up to 'size' bytes from 'buf' into the guest buffers described by
+ * the command's PRDT, and record the number of bytes actually copied in
+ * the command header's prdbc field.
+ *
+ * p    - port the command was issued on
+ * slot - command slot, used to locate the command header
+ * cfis - command FIS; the PRDT starts 0x80 bytes after it
+ * buf  - source buffer of at least 'size' bytes
+ * size - maximum number of bytes to copy
+ *
+ * The source cursor is a byte pointer so that the pointer arithmetic is
+ * valid ISO C (arithmetic on void * is a compiler extension).
+ */
+static inline void
+write_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
+    void *buf, int size)
+{
+	struct ahci_cmd_hdr *hdr;
+	struct ahci_prdt_entry *prdt;
+	uint8_t *from;
+	int i, len;
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	len = size;
+	from = buf;
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+	for (i = 0; i < hdr->prdtl && len; i++) {
+		uint8_t *ptr;
+		uint32_t dbcsz;
+		int sublen;
+
+		/* The dbc field stores the byte count minus one. */
+		dbcsz = (prdt->dbc & DBCMASK) + 1;
+		ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
+		sublen = MIN(len, dbcsz);
+		memcpy(ptr, from, sublen);
+		len -= sublen;
+		from += sublen;
+		prdt++;
+	}
+	hdr->prdbc = size - len;
+}
+
+/*
+ * Store a checksum in the last byte of 'buf' so that the 8-bit sum of all
+ * 'size' bytes is zero.
+ */
+static void
+ahci_checksum(uint8_t *buf, int size)
+{
+	uint8_t total = 0;
+	int idx = 0;
+
+	while (idx < size - 1) {
+		total += buf[idx];
+		idx++;
+	}
+	/* Two's complement of the running sum, truncated to one byte. */
+	buf[size - 1] = (uint8_t)(0 - total);
+}
+
+/*
+ * Handle a read-log command for a single 512-byte log page.
+ *
+ * The request is aborted for ATAPI devices, when no PRDT is supplied, when
+ * the FIS fields do not describe exactly one page at offset zero, or when
+ * the page selected by cfis[4] is not one of the supported pages:
+ *   0x00  log directory
+ *   0x10  NCQ command error log (built from p->err_cfis, checksummed)
+ *   0x13  SATA NCQ send and receive log
+ */
+static void
+ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	struct ahci_cmd_hdr *hdr;
+	uint32_t buf[128];
+	uint8_t *buf8 = (uint8_t *)buf;
+	uint16_t *buf16 = (uint16_t *)buf;
+	int ok;
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	ok = !p->atapi && hdr->prdtl != 0 && cfis[5] == 0 &&
+	    cfis[9] == 0 && cfis[12] == 1 && cfis[13] == 0;
+
+	if (ok) {
+		memset(buf, 0, sizeof(buf));
+		switch (cfis[4]) {
+		case 0x00:	/* Log directory */
+			buf16[0x00] = 1; /* Version -- 1 */
+			buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */
+			buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */
+			break;
+		case 0x10:	/* NCQ Command Error Log */
+			memcpy(buf8, p->err_cfis, sizeof(p->err_cfis));
+			ahci_checksum(buf8, sizeof(buf));
+			break;
+		case 0x13:	/* SATA NCQ Send and Receive Log */
+			if (blockif_candelete(p->bctx) &&
+			    !blockif_is_ro(p->bctx)) {
+				buf[0x00] = 1;	/* SFQ DSM supported */
+				buf[0x01] = 1;	/* SFQ DSM TRIM supported */
+			}
+			break;
+		default:
+			ok = 0;
+			break;
+		}
+	}
+
+	if (!ok) {
+		ahci_write_fis_d2h(p, slot, cfis,
+		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
+		return;
+	}
+
+	/* Only the READ LOG EXT form is preceded by a PIO setup FIS. */
+	if (cfis[2] == ATA_READ_LOG_EXT)
+		ahci_write_fis_piosetup(p);
+	write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+}
+
+/*
+ * Handle an identify command for an ATA disk.
+ *
+ * The command is aborted for ATAPI devices or when no PRDT is supplied.
+ * Otherwise a 256-word identify block is built from the backing store's
+ * geometry, size, sector sizes and capabilities, checksummed, copied to
+ * the guest, and the command is completed.
+ */
+static void
+handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	struct ahci_cmd_hdr *hdr;
+	uint16_t buf[256];
+	uint64_t sectors;
+	int sectsz, psectsz, psectoff, candelete, ro;
+	uint16_t cyl;
+	uint8_t sech, heads;
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	if (p->atapi || hdr->prdtl == 0) {
+		ahci_write_fis_d2h(p, slot, cfis,
+		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
+		return;
+	}
+
+	ro = blockif_is_ro(p->bctx);
+	candelete = blockif_candelete(p->bctx);
+	sectsz = blockif_sectsz(p->bctx);
+	sectors = blockif_size(p->bctx) / sectsz;
+	blockif_chs(p->bctx, &cyl, &heads, &sech);
+	blockif_psectsz(p->bctx, &psectsz, &psectoff);
+
+	memset(buf, 0, sizeof(buf));
+	buf[0] = 0x0040;
+	buf[1] = cyl;
+	buf[3] = heads;
+	buf[6] = sech;
+	ata_string((uint8_t *)(buf + 10), p->ident, 20);
+	ata_string((uint8_t *)(buf + 23), "001", 8);
+	ata_string((uint8_t *)(buf + 27), "BHYVE SATA DISK", 40);
+	buf[47] = (0x8000 | 128);
+	buf[48] = 0;
+	buf[49] = (1 << 8 | 1 << 9 | 1 << 11);
+	buf[50] = (1 << 14);
+	buf[53] = (1 << 1 | 1 << 2);
+	if (p->mult_sectors)
+		buf[59] = (0x100 | p->mult_sectors);
+	/* Words 60-61 hold a 28-bit sector count, saturated if too large. */
+	if (sectors <= 0x0fffffff) {
+		buf[60] = sectors;
+		buf[61] = (sectors >> 16);
+	} else {
+		buf[60] = 0xffff;
+		buf[61] = 0x0fff;
+	}
+	buf[63] = 0x7;
+	if (p->xfermode & ATA_WDMA0)
+		buf[63] |= (1 << ((p->xfermode & 7) + 8));
+	buf[64] = 0x3;
+	buf[65] = 120;
+	buf[66] = 120;
+	buf[67] = 120;
+	buf[68] = 120;
+	buf[69] = 0;
+	buf[75] = 31;
+	buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 |
+	    ATA_SUPPORT_NCQ);
+	buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
+	    (p->ssts & ATA_SS_SPD_MASK) >> 3);
+	buf[80] = 0x3f0;
+	buf[81] = 0x28;
+	buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
+	    ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
+	buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
+	    ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
+	buf[84] = (1 << 14);
+	buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
+	    ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
+	buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
+	    ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
+	buf[87] = (1 << 14);
+	buf[88] = 0x7f;
+	if (p->xfermode & ATA_UDMA0)
+		buf[88] |= (1 << ((p->xfermode & 7) + 8));
+	/* Words 100-103 hold the full 64-bit sector count. */
+	buf[100] = sectors;
+	buf[101] = (sectors >> 16);
+	buf[102] = (sectors >> 32);
+	buf[103] = (sectors >> 48);
+	/* Trim is advertised only for writable stores that support delete. */
+	if (candelete && !ro) {
+		buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT;
+		buf[105] = 1;
+		buf[169] = ATA_SUPPORT_DSM_TRIM;
+	}
+	buf[106] = 0x4000;
+	buf[209] = 0x4000;
+	if (psectsz > sectsz) {
+		buf[106] |= 0x2000;
+		buf[106] |= ffsl(psectsz / sectsz) - 1;
+		buf[209] |= (psectoff / sectsz);
+	}
+	if (sectsz > 512) {
+		buf[106] |= 0x1000;
+		buf[117] = sectsz / 2;
+		buf[118] = ((sectsz / 2) >> 16);
+	}
+	buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+	buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+	buf[222] = 0x1020;
+	buf[255] = 0x00a5;
+	ahci_checksum((uint8_t *)buf, sizeof(buf));
+
+	ahci_write_fis_piosetup(p);
+	write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+}
+
+/*
+ * Handle an identify command for an ATAPI device.
+ *
+ * The command is aborted when the port is not an ATAPI device.  Otherwise
+ * a 256-word identify block is built, checksummed, copied to the guest,
+ * and the command is completed.
+ */
+static void
+handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint16_t buf[256];
+
+	if (!p->atapi) {
+		ahci_write_fis_d2h(p, slot, cfis,
+		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
+		return;
+	}
+
+	memset(buf, 0, sizeof(buf));
+	buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5);
+	ata_string((uint8_t *)(buf + 10), p->ident, 20);
+	ata_string((uint8_t *)(buf + 23), "001", 8);
+	ata_string((uint8_t *)(buf + 27), "BHYVE SATA DVD ROM", 40);
+	buf[49] = (1 << 9 | 1 << 8);
+	buf[50] = (1 << 14 | 1);
+	buf[53] = (1 << 2 | 1 << 1);
+	buf[62] = 0x3f;
+	buf[63] = 7;
+	if (p->xfermode & ATA_WDMA0)
+		buf[63] |= (1 << ((p->xfermode & 7) + 8));
+	buf[64] = 3;
+	buf[65] = 120;
+	buf[66] = 120;
+	buf[67] = 120;
+	buf[68] = 120;
+	buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
+	buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
+	buf[78] = (1 << 5);
+	buf[80] = 0x3f0;
+	buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
+	    ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+	buf[83] = (1 << 14);
+	buf[84] = (1 << 14);
+	buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
+	    ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+	buf[87] = (1 << 14);
+	buf[88] = 0x7f;
+	if (p->xfermode & ATA_UDMA0)
+		buf[88] |= (1 << ((p->xfermode & 7) + 8));
+	buf[222] = 0x1020;
+	buf[255] = 0x00a5;
+	ahci_checksum((uint8_t *)buf, sizeof(buf));
+
+	ahci_write_fis_piosetup(p);
+	write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+}
+
+/*
+ * Handle the INQUIRY packet command.
+ *
+ * The packet starts 0x40 bytes into the command FIS.  When bit 0 of
+ * acmd[1] is set a VPD page is requested: only page 0 is supported, any
+ * other page fails with an illegal-request sense key and ASC 0x24.
+ * Otherwise the 36-byte standard inquiry data is returned.  The reply is
+ * truncated to the allocation length in acmd[4].
+ */
+static void
+atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t buf[36];
+	uint8_t *acmd = cfis + 0x40;
+	int len;
+	uint32_t tfd;
+
+	if ((acmd[1] & 1) && acmd[2] != 0) {
+		/* Unsupported VPD page. */
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x24;
+		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		ahci_write_fis_d2h(p, slot, cfis, tfd);
+		return;
+	}
+
+	if (acmd[1] & 1) {
+		/* Supported VPD pages */
+		buf[0] = 0x05;
+		buf[1] = 0;
+		buf[2] = 0;
+		buf[3] = 1;
+		buf[4] = 0;
+		len = 4 + buf[3];
+	} else {
+		/* Standard inquiry data */
+		buf[0] = 0x05;
+		buf[1] = 0x80;
+		buf[2] = 0x00;
+		buf[3] = 0x21;
+		buf[4] = 31;
+		buf[5] = 0;
+		buf[6] = 0;
+		buf[7] = 0;
+		atapi_string(buf + 8, "BHYVE", 8);
+		atapi_string(buf + 16, "BHYVE DVD-ROM", 16);
+		atapi_string(buf + 32, "001", 4);
+		len = sizeof(buf);
+	}
+
+	/* Never return more than the guest asked for. */
+	if (len > acmd[4])
+		len = acmd[4];
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	write_prdt(p, slot, cfis, buf, len);
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+/*
+ * Handle the READ CAPACITY packet command: reply with the big-endian
+ * address of the last 2048-byte block followed by the block size (2048).
+ */
+static void
+atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t reply[8];
+	uint64_t nblocks;
+
+	nblocks = blockif_size(p->bctx) / 2048;
+	be32enc(&reply[0], nblocks - 1);
+	be32enc(&reply[4], 2048);
+
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	write_prdt(p, slot, cfis, reply, sizeof(reply));
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+/*
+ * Handle the READ TOC packet command.
+ *
+ * The packet starts 0x40 bytes into the command FIS.  The allocation length
+ * is the big-endian 16-bit value at acmd[7] and the reply format is taken
+ * from the top two bits of acmd[9]:
+ *   0 - descriptors for track 1 (unless the starting track is 0xaa) and for
+ *       track 0xaa
+ *   1 - a fixed 12-byte reply
+ *   2 - four 11-byte descriptors (points 0xa0, 0xa1, 0xa2 and 1)
+ * Any other format, or a starting track other than 0, 1 or 0xaa in format 0,
+ * fails with an illegal-request sense key and ASC 0x24.
+ * Bit 1 of acmd[1] selects minute/second/frame addresses instead of
+ * logical block addresses.  Replies are truncated to the allocation length.
+ */
+static void
+atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t *acmd;
+ uint8_t format;
+ int len;
+
+ acmd = cfis + 0x40;
+
+ len = be16dec(acmd + 7);
+ format = acmd[9] >> 6;
+ switch (format) {
+ case 0:
+ {
+ int msf, size;
+ uint64_t sectors;
+ uint8_t start_track, buf[20], *bp;
+
+ msf = (acmd[1] >> 1) & 1;
+ start_track = acmd[6];
+ if (start_track > 1 && start_track != 0xaa) {
+ uint32_t tfd;
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+ return;
+ }
+ /* Bytes 0-1 (data length) are filled in once the size is known. */
+ bp = buf + 2;
+ *bp++ = 1;
+ *bp++ = 1;
+ if (start_track <= 1) {
+ /* Descriptor for track 1, starting at address 0. */
+ *bp++ = 0;
+ *bp++ = 0x14;
+ *bp++ = 1;
+ *bp++ = 0;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, 0);
+ bp += 3;
+ } else {
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ }
+ }
+ /* Descriptor for track 0xaa, located at the end of the medium. */
+ *bp++ = 0;
+ *bp++ = 0x14;
+ *bp++ = 0xaa;
+ *bp++ = 0;
+ sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ /* Divide the backing-store sector count by four. */
+ sectors >>= 2;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, sectors);
+ bp += 3;
+ } else {
+ be32enc(bp, sectors);
+ bp += 4;
+ }
+ size = bp - buf;
+ be16enc(buf, size - 2);
+ if (len > size)
+ len = size;
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ case 1:
+ {
+ uint8_t buf[12];
+
+ memset(buf, 0, sizeof(buf));
+ buf[1] = 0xa;
+ buf[2] = 0x1;
+ buf[3] = 0x1;
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ case 2:
+ {
+ int msf, size;
+ uint64_t sectors;
+ uint8_t *bp, buf[50];
+
+ msf = (acmd[1] >> 1) & 1;
+ /* Bytes 0-1 (data length) are filled in once the size is known. */
+ bp = buf + 2;
+ *bp++ = 1;
+ *bp++ = 1;
+
+ /* Descriptor for point 0xa0. */
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+
+ /* Descriptor for point 0xa1. */
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa1;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+
+ /* Descriptor for point 0xa2, carrying the end-of-medium address. */
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa2;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ /* Divide the backing-store sector count by four. */
+ sectors >>= 2;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, sectors);
+ bp += 3;
+ } else {
+ be32enc(bp, sectors);
+ bp += 4;
+ }
+
+ /* Descriptor for point 1, starting at address 0. */
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, 0);
+ bp += 3;
+ } else {
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ }
+
+ size = bp - buf;
+ be16enc(buf, size - 2);
+ if (len > size)
+ len = size;
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ default:
+ {
+ uint32_t tfd;
+
+ /* Unsupported format. */
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+ break;
+ }
+ }
+}
+
+/*
+ * Answer the ATAPI REPORT LUNS command with a fixed 16-byte response that
+ * is all zeroes except byte 3, which is 8 (presumably the LUN list length
+ * covering the single all-zero entry that follows the header).
+ */
+static void
+atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t resp[16] = { 0 };
+
+	resp[3] = 8;
+
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	write_prdt(p, slot, cfis, resp, sizeof(resp));
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+/*
+ * Start (or continue) an ATAPI READ(10)/READ(12) against the backing store.
+ *
+ * p    - port the command was issued on
+ * slot - command slot being serviced
+ * cfis - command FIS; the ATAPI CDB lives at offset 0x40 and the PRDT at
+ *        offset 0x80
+ * done - bytes of this command already transferred by earlier passes
+ *        (0 on the first call; atapi_ioreq_cb() passes aior->done when the
+ *        request needs another pass)
+ *
+ * The CDB carries the LBA and transfer length in 2048-byte blocks; both are
+ * scaled to bytes before the blockif request is built.  Completion is
+ * reported from the blockif callback, except for a zero-length transfer,
+ * which completes immediately and queues no I/O.
+ */
+static void
+atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
+{
+	struct ahci_ioreq *aior;
+	struct ahci_cmd_hdr *hdr;
+	struct ahci_prdt_entry *prdt;
+	struct blockif_req *breq;
+	uint8_t *acmd;
+	uint64_t lba;
+	uint32_t len;
+	int err;
+
+	acmd = cfis + 0x40;
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+	lba = be32dec(acmd + 2);
+	if (acmd[0] == READ_10)
+		len = be16dec(acmd + 7);
+	else
+		len = be32dec(acmd + 6);
+	if (len == 0) {
+		/*
+		 * Nothing to transfer: the command is complete.  Return here
+		 * so that no blockif request is queued and the slot is not
+		 * completed a second time from the I/O callback.
+		 */
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+		return;
+	}
+	lba *= 2048;
+	len *= 2048;
+
+	/*
+	 * Pull request off free list
+	 */
+	aior = STAILQ_FIRST(&p->iofhd);
+	assert(aior != NULL);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+	aior->cfis = cfis;
+	aior->slot = slot;
+	aior->len = len;
+	aior->done = done;
+	breq = &aior->io_req;
+	breq->br_offset = lba + done;
+	ahci_build_iov(p, aior, prdt, hdr->prdtl);
+
+	/* Mark this command in-flight. */
+	p->pending |= 1 << slot;
+
+	/* Stuff request onto busy list. */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	err = blockif_read(p->bctx, breq);
+	assert(err == 0);
+}
+
+/*
+ * Answer ATAPI REQUEST SENSE from the sense key and ASC saved on the port.
+ * The response is limited to the smaller of the allocation length in CDB
+ * byte 4 and the 64-byte local buffer; only that many bytes are cleared
+ * and transferred.
+ */
+static void
+atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t sense[64];
+	uint8_t *cdb;
+	int xfer;
+
+	cdb = cfis + 0x40;
+	xfer = cdb[4];
+	if (xfer > sizeof(sense))
+		xfer = sizeof(sense);
+
+	memset(sense, 0, xfer);
+	sense[0] = 0x70 | (1 << 7);
+	sense[2] = p->sense_key;
+	sense[7] = 10;
+	sense[12] = p->asc;
+
+	write_prdt(p, slot, cfis, sense, xfer);
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+/*
+ * Handle ATAPI START STOP UNIT.  The low two bits of CDB byte 4 select the
+ * operation: value 2 is rejected with ILLEGAL REQUEST / ASC 0x53, every
+ * other value is accepted without doing anything.
+ */
+static void
+atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t *cdb = cfis + 0x40;
+	uint32_t status;
+
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+
+	if ((cdb[4] & 3) == 2) {
+		/* TODO eject media */
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x53;
+		status = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+	} else {
+		status = ATA_S_READY | ATA_S_DSC;
+	}
+
+	ahci_write_fis_d2h(p, slot, cfis, status);
+}
+
+/*
+ * Handle ATAPI MODE SENSE(10).
+ *
+ * CDB byte 2 carries the page control field (top two bits) and the page
+ * code (low six bits); CDB bytes 7-8 carry the allocation length.
+ *   page control 0: the R/W error recovery and CD capabilities pages are
+ *                   returned, truncated to the allocation length; any
+ *                   other page fails with ASC 0x24.
+ *   page control 3: fails with ASC 0x39.
+ *   page control 1 or 2: fails with ASC 0x24.
+ * All failures use sense key ILLEGAL REQUEST.
+ */
+static void
+atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	enum { MS_OK, MS_BAD_FIELD, MS_NO_SAVED } outcome;
+	uint8_t *cdb;
+	uint32_t tfd = 0;
+	uint8_t pc, code;
+	int len;
+
+	cdb = cfis + 0x40;
+	len = be16dec(cdb + 7);
+	pc = cdb[2] >> 6;
+	code = cdb[2] & 0x3f;
+
+	if (pc == 0 && code == MODEPAGE_RW_ERROR_RECOVERY) {
+		uint8_t page[16];
+
+		if (len > sizeof(page))
+			len = sizeof(page);
+
+		memset(page, 0, sizeof(page));
+		be16enc(page, 16 - 2);
+		page[2] = 0x70;
+		page[8] = 0x01;
+		page[9] = 16 - 10;
+		page[11] = 0x05;
+		write_prdt(p, slot, cfis, page, len);
+		outcome = MS_OK;
+	} else if (pc == 0 && code == MODEPAGE_CD_CAPABILITIES) {
+		uint8_t page[30];
+
+		if (len > sizeof(page))
+			len = sizeof(page);
+
+		memset(page, 0, sizeof(page));
+		be16enc(page, 30 - 2);
+		page[2] = 0x70;
+		page[8] = 0x2A;
+		page[9] = 30 - 10;
+		page[10] = 0x08;
+		page[12] = 0x71;
+		be16enc(&page[18], 2);
+		be16enc(&page[20], 512);
+		write_prdt(p, slot, cfis, page, len);
+		outcome = MS_OK;
+	} else if (pc == 3) {
+		outcome = MS_NO_SAVED;
+	} else {
+		/* Page control 1 or 2, or an unsupported page code. */
+		outcome = MS_BAD_FIELD;
+	}
+
+	switch (outcome) {
+	case MS_OK:
+		tfd = ATA_S_READY | ATA_S_DSC;
+		break;
+	case MS_NO_SAVED:
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x39;
+		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+		break;
+	case MS_BAD_FIELD:
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x24;
+		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+		break;
+	}
+
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	ahci_write_fis_d2h(p, slot, cfis, tfd);
+}
+
+/*
+ * Handle ATAPI GET EVENT STATUS NOTIFICATION.  Only requests with bit 0 of
+ * CDB byte 1 set are served, with a fixed 8-byte response truncated to the
+ * allocation length in CDB bytes 7-8; requests with that bit clear fail
+ * with ILLEGAL REQUEST / ASC 0x24.
+ */
+static void
+atapi_get_event_status_notification(struct ahci_port *p, int slot,
+    uint8_t *cfis)
+{
+	uint8_t *cdb;
+	uint32_t status;
+
+	cdb = cfis + 0x40;
+
+	if (cdb[1] & 1) {
+		uint8_t event[8];
+		int xfer;
+
+		xfer = be16dec(cdb + 7);
+		if (xfer > sizeof(event))
+			xfer = sizeof(event);
+
+		memset(event, 0, sizeof(event));
+		be16enc(event, 8 - 2);
+		event[2] = 0x04;
+		event[3] = 0x10;
+		event[5] = 0x02;
+		write_prdt(p, slot, cfis, event, xfer);
+		status = ATA_S_READY | ATA_S_DSC;
+	} else {
+		/* we don't support asynchronous operation */
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x24;
+		status = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+	}
+
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	ahci_write_fis_d2h(p, slot, cfis, status);
+}
+
+/*
+ * Dispatch an ATA PACKET command on the opcode in the first byte of the
+ * ATAPI CDB (offset 0x40 in the command FIS).  TEST UNIT READY and
+ * PREVENT/ALLOW succeed without further work; unknown opcodes fail with
+ * ILLEGAL REQUEST / ASC 0x20.
+ */
+static void
+handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t *cdb = cfis + 0x40;
+	uint32_t status;
+
+#ifdef AHCI_DEBUG
+	{
+		int n;
+
+		DPRINTF("ACMD:");
+		for (n = 0; n < 16; n++)
+			DPRINTF("%02x ", cdb[n]);
+		DPRINTF("\n");
+	}
+#endif
+
+	switch (cdb[0]) {
+	case TEST_UNIT_READY:
+	case PREVENT_ALLOW:
+		/* TODO: PREVENT_ALLOW is accepted but has no effect. */
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+		break;
+	case INQUIRY:
+		atapi_inquiry(p, slot, cfis);
+		break;
+	case READ_CAPACITY:
+		atapi_read_capacity(p, slot, cfis);
+		break;
+	case READ_TOC:
+		atapi_read_toc(p, slot, cfis);
+		break;
+	case REPORT_LUNS:
+		atapi_report_luns(p, slot, cfis);
+		break;
+	case READ_10:
+	case READ_12:
+		atapi_read(p, slot, cfis, 0);
+		break;
+	case REQUEST_SENSE:
+		atapi_request_sense(p, slot, cfis);
+		break;
+	case START_STOP_UNIT:
+		atapi_start_stop_unit(p, slot, cfis);
+		break;
+	case MODE_SENSE_10:
+		atapi_mode_sense(p, slot, cfis);
+		break;
+	case GET_EVENT_STATUS_NOTIFICATION:
+		atapi_get_event_status_notification(p, slot, cfis);
+		break;
+	default:
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x20;
+		status = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+		ahci_write_fis_d2h(p, slot, cfis, status);
+		break;
+	}
+}
+
+/*
+ * Execute the ATA command carried by a register host-to-device FIS.
+ *
+ * The port is flagged busy first and the command code in cfis[2] is then
+ * dispatched.  Commands handled inline complete right away with a D2H
+ * FIS; read/write, flush, TRIM and read-log are handed to their own
+ * handlers.  Unsupported commands are aborted.
+ */
+static void
+ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	const uint32_t tfd_good = ATA_S_READY | ATA_S_DSC;
+	const uint32_t tfd_abort =
+	    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+
+	p->tfd |= ATA_S_BUSY;
+	switch (cfis[2]) {
+	case ATA_ATA_IDENTIFY:
+		handle_identify(p, slot, cfis);
+		break;
+	case ATA_SETFEATURES:
+	{
+		int accepted;
+
+		switch (cfis[3]) {
+		case ATA_SF_ENAB_SATA_SF:
+			/* Only the AN SATA feature may be enabled. */
+			accepted = (cfis[12] == ATA_SATA_SF_AN);
+			break;
+		case ATA_SF_ENAB_WCACHE:
+		case ATA_SF_DIS_WCACHE:
+		case ATA_SF_ENAB_RCACHE:
+		case ATA_SF_DIS_RCACHE:
+			accepted = 1;
+			break;
+		case ATA_SF_SETXFER:
+		{
+			uint8_t xfer_class = cfis[12] & 0xf8;
+
+			/* PIO classes are accepted but not recorded. */
+			if (xfer_class == ATA_WDMA0 ||
+			    xfer_class == ATA_UDMA0)
+				p->xfermode = (cfis[12] & 0x7);
+			accepted = 1;
+			break;
+		}
+		default:
+			accepted = 0;
+			break;
+		}
+
+		if (accepted)
+			p->tfd = ATA_S_DSC | ATA_S_READY;
+		else
+			p->tfd = ATA_S_ERROR | ATA_S_READY |
+			    (ATA_ERROR_ABORT << 8);
+		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
+		break;
+	}
+	case ATA_SET_MULTI:
+		/* Accept 0, or a power of two no larger than 128. */
+		if (cfis[12] == 0 ||
+		    (cfis[12] <= 128 && (cfis[12] & (cfis[12] - 1)) == 0)) {
+			p->mult_sectors = cfis[12];
+			p->tfd = ATA_S_DSC | ATA_S_READY;
+		} else {
+			p->tfd = ATA_S_ERROR | ATA_S_READY |
+			    (ATA_ERROR_ABORT << 8);
+		}
+		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
+		break;
+	case ATA_READ:
+	case ATA_WRITE:
+	case ATA_READ48:
+	case ATA_WRITE48:
+	case ATA_READ_MUL:
+	case ATA_WRITE_MUL:
+	case ATA_READ_MUL48:
+	case ATA_WRITE_MUL48:
+	case ATA_READ_DMA:
+	case ATA_WRITE_DMA:
+	case ATA_READ_DMA48:
+	case ATA_WRITE_DMA48:
+	case ATA_READ_FPDMA_QUEUED:
+	case ATA_WRITE_FPDMA_QUEUED:
+		ahci_handle_rw(p, slot, cfis, 0);
+		break;
+	case ATA_FLUSHCACHE:
+	case ATA_FLUSHCACHE48:
+		ahci_handle_flush(p, slot, cfis);
+		break;
+	case ATA_DATA_SET_MANAGEMENT:
+		if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM &&
+		    cfis[13] == 0 && cfis[12] == 1)
+			ahci_handle_dsm_trim(p, slot, cfis, 0);
+		else
+			ahci_write_fis_d2h(p, slot, cfis, tfd_abort);
+		break;
+	case ATA_SEND_FPDMA_QUEUED:
+		if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM &&
+		    cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM &&
+		    cfis[11] == 0 && cfis[3] == 1)
+			ahci_handle_dsm_trim(p, slot, cfis, 0);
+		else
+			ahci_write_fis_d2h(p, slot, cfis, tfd_abort);
+		break;
+	case ATA_READ_LOG_EXT:
+	case ATA_READ_LOG_DMA_EXT:
+		ahci_handle_read_log(p, slot, cfis);
+		break;
+	case ATA_SECURITY_FREEZE_LOCK:
+	case ATA_SMART_CMD:
+	case ATA_NOP:
+		ahci_write_fis_d2h(p, slot, cfis, tfd_abort);
+		break;
+	case ATA_CHECK_POWER_MODE:
+		cfis[12] = 0xff;	/* always on */
+		ahci_write_fis_d2h(p, slot, cfis, tfd_good);
+		break;
+	case ATA_STANDBY_CMD:
+	case ATA_STANDBY_IMMEDIATE:
+	case ATA_IDLE_CMD:
+	case ATA_IDLE_IMMEDIATE:
+	case ATA_SLEEP:
+	case ATA_READ_VERIFY:
+	case ATA_READ_VERIFY48:
+		ahci_write_fis_d2h(p, slot, cfis, tfd_good);
+		break;
+	case ATA_ATAPI_IDENTIFY:
+		handle_atapi_identify(p, slot, cfis);
+		break;
+	case ATA_PACKET_CMD:
+		if (p->atapi)
+			handle_packet_cmd(p, slot, cfis);
+		else
+			ahci_write_fis_d2h(p, slot, cfis, tfd_abort);
+		break;
+	default:
+		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
+		ahci_write_fis_d2h(p, slot, cfis, tfd_abort);
+		break;
+	}
+}
+
+/*
+ * Process the command in one slot of the port's command list.
+ *
+ * The command table (command FIS plus PRDT) is mapped from guest memory
+ * using the address in the slot's command header.  A FIS with the command
+ * bit (0x80 in byte 1) set is executed; otherwise it is a control update:
+ * setting bit 2 of byte 15 arms a reset, and clearing it afterwards
+ * performs the port reset.  Control updates complete at once by clearing
+ * the slot's bit in CI.
+ *
+ * NOTE(review): when the FIS is not a register H2D FIS this returns with
+ * the slot's CI bit still set; confirm the scan loop in ahci_handle_port()
+ * cannot spin on such a slot.
+ */
+static void
+ahci_handle_slot(struct ahci_port *p, int slot)
+{
+	struct pci_ahci_softc *sc = p->pr_sc;
+	struct ahci_cmd_hdr *hdr;
+	uint8_t *cfis;
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
+	    0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
+
+#ifdef AHCI_DEBUG
+	{
+		struct ahci_prdt_entry *prdt;
+		int cfl, i;
+
+		cfl = (hdr->flags & 0x1f) * 4;
+		prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+		DPRINTF("\ncfis:");
+		for (i = 0; i < cfl; i++) {
+			if (i % 10 == 0)
+				DPRINTF("\n");
+			DPRINTF("%02x ", cfis[i]);
+		}
+		DPRINTF("\n");
+
+		for (i = 0; i < hdr->prdtl; i++)
+			DPRINTF("%d@%08"PRIx64"\n", prdt[i].dbc & 0x3fffff,
+			    prdt[i].dba);
+	}
+#endif
+
+	if (cfis[0] != FIS_TYPE_REGH2D) {
+		WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
+		return;
+	}
+
+	if (cfis[1] & 0x80) {
+		ahci_handle_cmd(p, slot, cfis);
+		return;
+	}
+
+	/* Control FIS: track the reset bit across two updates. */
+	if (cfis[15] & (1 << 2)) {
+		p->reset = 1;
+	} else if (p->reset) {
+		p->reset = 0;
+		ahci_port_reset(p);
+	}
+	p->ci &= ~(1 << slot);
+}
+
+/*
+ * Issue newly posted commands on a started port.
+ *
+ * Slots are scanned round-robin from the current command slot (ccs) for
+ * as long as CI holds a bit that is not already pending.  Scanning stops
+ * early when the device reports BUSY or DRQ, or while the port is waiting
+ * for an error to be cleared.
+ */
+static void
+ahci_handle_port(struct ahci_port *p)
+{
+
+	if ((p->cmd & AHCI_P_CMD_ST) == 0)
+		return;
+
+	while ((p->ci & ~p->pending) != 0) {
+		if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0 ||
+		    p->waitforclear)
+			break;
+
+		if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) {
+			/* Publish the slot being issued in PxCMD.CCS. */
+			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
+			p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT;
+			ahci_handle_slot(p, p->ccs);
+		}
+
+		p->ccs = (p->ccs + 1) & 31;
+	}
+}
+
+/*
+ * blockif completion callback for ATA (non-ATAPI) ports.  It runs in the
+ * context of the blockif i/o thread, so the controller mutex is taken
+ * before any port state is touched.
+ *
+ * The request is returned to the free list.  If the command still has
+ * data left (aior->more) and no error occurred, the next pass is started;
+ * otherwise the command is completed with an SDB FIS for NCQ commands or
+ * a D2H FIS for everything else, and the port is rescanned for new work.
+ */
+static void
+ata_ioreq_cb(struct blockif_req *br, int err)
+{
+	struct ahci_ioreq *aior = br->br_param;
+	struct ahci_port *p = aior->io_pr;
+	struct pci_ahci_softc *sc = p->pr_sc;
+	struct ahci_cmd_hdr *hdr;
+	uint8_t *cfis = aior->cfis;
+	uint32_t tfd;
+	int slot = aior->slot;
+	int ncq, dsm;
+
+	DPRINTF("%s %d\n", __func__, err);
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+
+	ncq = (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+	    cfis[2] == ATA_READ_FPDMA_QUEUED ||
+	    cfis[2] == ATA_SEND_FPDMA_QUEUED) ? 1 : 0;
+	dsm = (cfis[2] == ATA_DATA_SET_MANAGEMENT ||
+	    (cfis[2] == ATA_SEND_FPDMA_QUEUED &&
+	    (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) ? 1 : 0;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/* Take the request off the busy list and hand it back as free. */
+	TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+
+	if (!err)
+		hdr->prdbc = aior->done;
+
+	if (!err && aior->more) {
+		/* More data to move: start the next pass. */
+		if (dsm)
+			ahci_handle_dsm_trim(p, slot, cfis, aior->done);
+		else
+			ahci_handle_rw(p, slot, cfis, aior->done);
+	} else {
+		tfd = err ?
+		    ((ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR) :
+		    (ATA_S_READY | ATA_S_DSC);
+		if (ncq)
+			ahci_write_fis_sdb(p, slot, cfis, tfd);
+		else
+			ahci_write_fis_d2h(p, slot, cfis, tfd);
+
+		/* This command is now complete. */
+		p->pending &= ~(1 << slot);
+
+		ahci_check_stopped(p);
+		ahci_handle_port(p);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+	DPRINTF("%s exit\n", __func__);
+}
+
+/*
+ * blockif completion callback for ATAPI ports; the controller mutex is
+ * held while port state is updated.
+ *
+ * The request is returned to the free list.  A successful request that
+ * still has data left (aior->more) is continued through atapi_read();
+ * otherwise the command is completed with a D2H FIS, recording ILLEGAL
+ * REQUEST / ASC 0x21 as sense data when the read failed, and the port is
+ * rescanned for new work.
+ */
+static void
+atapi_ioreq_cb(struct blockif_req *br, int err)
+{
+	struct ahci_ioreq *aior = br->br_param;
+	struct ahci_port *p = aior->io_pr;
+	struct pci_ahci_softc *sc = p->pr_sc;
+	struct ahci_cmd_hdr *hdr;
+	uint8_t *cfis = aior->cfis;
+	uint32_t tfd;
+	int slot = aior->slot;
+
+	DPRINTF("%s %d\n", __func__, err);
+
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE);
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/* Take the request off the busy list and hand it back as free. */
+	TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+
+	if (!err)
+		hdr->prdbc = aior->done;
+
+	if (!err && aior->more) {
+		/* More data to move: start the next pass. */
+		atapi_read(p, slot, cfis, aior->done);
+	} else {
+		if (err) {
+			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+			p->asc = 0x21;
+			tfd = (p->sense_key << 12) | ATA_S_READY |
+			    ATA_S_ERROR;
+		} else {
+			tfd = ATA_S_READY | ATA_S_DSC;
+		}
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		ahci_write_fis_d2h(p, slot, cfis, tfd);
+
+		/* This command is now complete. */
+		p->pending &= ~(1 << slot);
+
+		ahci_check_stopped(p);
+		ahci_handle_port(p);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+	DPRINTF("%s exit\n", __func__);
+}
+
+/*
+ * Allocate the port's pool of i/o request structures, one per blockif
+ * queue entry, and place them all on the free list.  Each request is
+ * bound to its port and to the completion callback matching the port
+ * type (ATA or ATAPI).  The busy list starts out empty.
+ */
+static void
+pci_ahci_ioreq_init(struct ahci_port *pr)
+{
+	struct ahci_ioreq *req;
+	int n;
+
+	pr->ioqsz = blockif_queuesz(pr->bctx);
+	pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
+	STAILQ_INIT(&pr->iofhd);
+
+	for (n = 0; n < pr->ioqsz; n++) {
+		req = &pr->ioreq[n];
+		req->io_pr = pr;
+		req->io_req.br_callback =
+		    pr->atapi ? atapi_ioreq_cb : ata_ioreq_cb;
+		req->io_req.br_param = req;
+		STAILQ_INSERT_TAIL(&pr->iofhd, req, io_flist);
+	}
+
+	TAILQ_INIT(&pr->iobhd);
+}
+
+/*
+ * Handle a guest write to a per-port AHCI register.
+ *
+ * 'offset' is the offset within the BAR; it is split into a port number
+ * and a register offset inside that port's block.  Writes to PxCMD remap
+ * the command list and received-FIS areas from guest memory, and writes
+ * to PxCMD or PxCI rescan the port for commands to issue.  Writes to the
+ * read-only registers are logged and ignored.
+ */
+static void
+pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
+{
+	/* PxCMD bits that take their value from the guest write. */
+	const uint32_t cmd_rw_bits =
+	    AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
+	    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
+	    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
+	    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK;
+	struct ahci_port *p;
+	int port;
+
+	port = (offset - AHCI_OFFSET) / AHCI_STEP;
+	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
+	p = &sc->port[port];
+
+	DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
+	    port, offset, value);
+
+	switch (offset) {
+	case AHCI_P_CLB:
+		p->clb = value;
+		break;
+	case AHCI_P_CLBU:
+		p->clbu = value;
+		break;
+	case AHCI_P_FB:
+		p->fb = value;
+		break;
+	case AHCI_P_FBU:
+		p->fbu = value;
+		break;
+	case AHCI_P_IS:
+		/* Write-one-to-clear. */
+		p->is &= ~value;
+		ahci_port_intr(p);
+		break;
+	case AHCI_P_IE:
+		p->ie = value & 0xFDC000FF;
+		ahci_port_intr(p);
+		break;
+	case AHCI_P_CMD:
+	{
+		p->cmd &= ~cmd_rw_bits;
+		p->cmd |= cmd_rw_bits & value;
+
+		if (value & AHCI_P_CMD_ST) {
+			uint64_t clb;
+
+			/* Starting: map the command list. */
+			p->cmd |= AHCI_P_CMD_CR;
+			clb = (uint64_t)p->clbu << 32 | p->clb;
+			p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
+			    AHCI_CL_SIZE * AHCI_MAX_SLOTS);
+		} else {
+			ahci_port_stop(p);
+		}
+
+		if (value & AHCI_P_CMD_FRE) {
+			uint64_t fb;
+
+			p->cmd |= AHCI_P_CMD_FR;
+			fb = (uint64_t)p->fbu << 32 | p->fb;
+			/* we don't support FBSCP, so rfis size is 256Bytes */
+			p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
+		} else {
+			p->cmd &= ~AHCI_P_CMD_FR;
+		}
+
+		if (value & AHCI_P_CMD_CLO) {
+			/* Command list override: drop BUSY/DRQ, then ack. */
+			p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ);
+			p->cmd &= ~AHCI_P_CMD_CLO;
+		}
+
+		if (value & AHCI_P_CMD_ICC_MASK)
+			p->cmd &= ~AHCI_P_CMD_ICC_MASK;
+
+		ahci_handle_port(p);
+		break;
+	}
+	case AHCI_P_TFD:
+	case AHCI_P_SIG:
+	case AHCI_P_SSTS:
+		WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset);
+		break;
+	case AHCI_P_SCTL:
+		p->sctl = value;
+		/* A reset request is only honoured on a stopped port. */
+		if ((p->cmd & AHCI_P_CMD_ST) == 0 &&
+		    (value & ATA_SC_DET_RESET))
+			ahci_port_reset(p);
+		break;
+	case AHCI_P_SERR:
+		/* Write-one-to-clear. */
+		p->serr &= ~value;
+		break;
+	case AHCI_P_SACT:
+		p->sact |= value;
+		break;
+	case AHCI_P_CI:
+		p->ci |= value;
+		ahci_handle_port(p);
+		break;
+	case AHCI_P_SNTF:
+	case AHCI_P_FBS:
+	default:
+		break;
+	}
+}
+
+/*
+ * Handle a guest write to a global (HBA-level) AHCI register.
+ *
+ * Only GHC and IS are writable.  Setting GHC.HR resets the controller and
+ * nothing else in the write is acted upon; otherwise GHC.IE is copied
+ * from the written value and interrupts are re-evaluated.  IS is
+ * write-one-to-clear.  All other registers ignore writes.
+ */
+static void
+pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
+{
+	DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
+	    offset, value);
+
+	switch (offset) {
+	case AHCI_CAP:
+	case AHCI_PI:
+	case AHCI_VS:
+	case AHCI_CAP2:
+		DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset);
+		break;
+	case AHCI_GHC:
+		if (value & AHCI_GHC_HR) {
+			ahci_reset(sc);
+		} else {
+			if (value & AHCI_GHC_IE)
+				sc->ghc |= AHCI_GHC_IE;
+			else
+				sc->ghc &= ~AHCI_GHC_IE;
+			ahci_generate_intr(sc, 0xffffffff);
+		}
+		break;
+	case AHCI_IS:
+		sc->is &= ~value;
+		ahci_generate_intr(sc, value);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * BAR write entry point for the AHCI device.  Only aligned 4-byte writes
+ * to BAR 5 are expected.  Under the controller mutex the access is routed
+ * to the global or per-port register handler by offset; writes beyond the
+ * last port's register block are logged and dropped.
+ */
+static void
+pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_ahci_softc *sc = pi->pi_arg;
+
+	assert(baridx == 5);
+	assert((offset % 4) == 0 && size == 4);
+
+	pthread_mutex_lock(&sc->mtx);
+
+	if (offset >= AHCI_OFFSET + sc->ports * AHCI_STEP) {
+		WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset);
+	} else if (offset >= AHCI_OFFSET) {
+		pci_ahci_port_write(sc, offset, value);
+	} else {
+		pci_ahci_host_write(sc, offset, value);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Read a global (HBA-level) AHCI register.
+ *
+ * Known registers are fetched by indexing 32-bit words starting at
+ * sc->cap, so this relies on the softc laying those fields out in
+ * register order.  Unknown offsets read as zero.
+ */
+static uint64_t
+pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
+{
+	uint32_t value;
+
+	switch (offset) {
+	case AHCI_CAP:
+	case AHCI_GHC:
+	case AHCI_IS:
+	case AHCI_PI:
+	case AHCI_VS:
+	case AHCI_CCCC:
+	case AHCI_CCCP:
+	case AHCI_EM_LOC:
+	case AHCI_EM_CTL:
+	case AHCI_CAP2:
+	{
+		uint32_t *regs = &sc->cap;
+
+		value = regs[(offset - AHCI_CAP) / sizeof(uint32_t)];
+		break;
+	}
+	default:
+		value = 0;
+		break;
+	}
+	DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n",
+	    offset, value);
+
+	return (value);
+}
+
+/*
+ * Read a per-port AHCI register.
+ *
+ * 'offset' is the offset within the BAR; it is split into a port number
+ * and a register offset inside that port's block.  Known registers are
+ * fetched by indexing 32-bit words starting at the port's clb field, so
+ * this relies on the port structure laying those fields out in register
+ * order.  Unknown offsets read as zero.
+ */
+static uint64_t
+pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
+{
+	uint32_t value;
+	int port;
+
+	port = (offset - AHCI_OFFSET) / AHCI_STEP;
+	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
+
+	switch (offset) {
+	case AHCI_P_CLB:
+	case AHCI_P_CLBU:
+	case AHCI_P_FB:
+	case AHCI_P_FBU:
+	case AHCI_P_IS:
+	case AHCI_P_IE:
+	case AHCI_P_CMD:
+	case AHCI_P_TFD:
+	case AHCI_P_SIG:
+	case AHCI_P_SSTS:
+	case AHCI_P_SCTL:
+	case AHCI_P_SERR:
+	case AHCI_P_SACT:
+	case AHCI_P_CI:
+	case AHCI_P_SNTF:
+	case AHCI_P_FBS:
+	{
+		uint32_t *regs = &sc->port[port].clb;
+
+		value = regs[(offset - AHCI_P_CLB) / sizeof(uint32_t)];
+		break;
+	}
+	default:
+		value = 0;
+		break;
+	}
+
+	DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n",
+	    port, offset, value);
+
+	return (value);
+}
+
+/*
+ * BAR read entry point for the AHCI device.  Naturally aligned reads of
+ * 1, 2 or 4 bytes from BAR 5 are expected.  The containing 32-bit
+ * register is read under the controller mutex and shifted down so the
+ * requested bytes start at bit 0; reads beyond the last port's register
+ * block are logged and return zero.
+ */
+static uint64_t
+pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t regoff, int size)
+{
+	struct pci_ahci_softc *sc = pi->pi_arg;
+	uint64_t aligned;
+	uint32_t value;
+
+	assert(baridx == 5);
+	assert(size == 1 || size == 2 || size == 4);
+	assert((regoff & (size - 1)) == 0);
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/* round down to a multiple of 4 bytes */
+	aligned = regoff & ~0x3;
+	if (aligned >= AHCI_OFFSET + sc->ports * AHCI_STEP) {
+		value = 0;
+		WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n",
+		    regoff);
+	} else if (aligned >= AHCI_OFFSET) {
+		value = pci_ahci_port_read(sc, aligned);
+	} else {
+		value = pci_ahci_host_read(sc, aligned);
+	}
+	value >>= 8 * (regoff & 0x3);
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	return (value);
+}
+
+/*
+ * Create an AHCI controller instance and attach its backing images.
+ *
+ * ctx   - VM context (not used here)
+ * pi    - PCI device instance being initialised; pi_arg is set to the new
+ *         softc on success
+ * opts  - option string naming one backing image per port.  Ports are
+ *         separated by ",hd:" or ",cd:", and a leading "hd:" / "cd:"
+ *         selects the type of a port; the string is modified in place
+ * atapi - default port type (non-zero for ATAPI), used until an explicit
+ *         "hd:" / "cd:" prefix overrides it
+ *
+ * Returns 0 on success and 1 if the softc cannot be allocated or a backing
+ * image cannot be opened.  On failure every image opened so far is closed,
+ * all memory allocated here is released, and pi->pi_arg is cleared so that
+ * no pointer to the freed softc is left behind.
+ */
+static int
+pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
+{
+	char bident[sizeof("XX:XX:XX")];
+	struct blockif_ctxt *bctxt;
+	struct pci_ahci_softc *sc;
+	int ret, slots, p;
+	MD5_CTX mdctx;
+	u_char digest[16];
+	char *next, *next2;
+
+	ret = 0;
+
+#ifdef AHCI_DEBUG
+	dbg = fopen("/tmp/log", "w+");
+#endif
+
+	sc = calloc(1, sizeof(struct pci_ahci_softc));
+	if (sc == NULL)
+		return (1);
+	pi->pi_arg = sc;
+	sc->asc_pi = pi;
+	pthread_mutex_init(&sc->mtx, NULL);
+	sc->ports = 0;
+	sc->pi = 0;
+	slots = 32;
+
+	for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) {
+		/* Identify and cut off type of present port. */
+		if (strncmp(opts, "hd:", 3) == 0) {
+			atapi = 0;
+			opts += 3;
+		} else if (strncmp(opts, "cd:", 3) == 0) {
+			atapi = 1;
+			opts += 3;
+		}
+
+		/* Find and cut off the next port options. */
+		next = strstr(opts, ",hd:");
+		next2 = strstr(opts, ",cd:");
+		if (next == NULL || (next2 != NULL && next2 < next))
+			next = next2;
+		if (next != NULL) {
+			next[0] = 0;
+			next++;
+		}
+
+		/* An empty entry leaves this port without a device. */
+		if (opts[0] == 0)
+			continue;
+
+		/*
+		 * Attempt to open the backing image. Use the PCI slot/func
+		 * and the port number for the identifier string.
+		 */
+		snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot,
+		    pi->pi_func, p);
+		bctxt = blockif_open(opts, bident);
+		if (bctxt == NULL) {
+			/* Limit cleanup to the ports set up so far. */
+			sc->ports = p;
+			ret = 1;
+			goto open_fail;
+		}
+		sc->port[p].bctx = bctxt;
+		sc->port[p].pr_sc = sc;
+		sc->port[p].port = p;
+		sc->port[p].atapi = atapi;
+
+#ifndef __FreeBSD__
+		/*
+		 * Attempt to enable the write cache for this device, as the
+		 * guest will issue FLUSH commands when it requires durability.
+		 *
+		 * Failure here is fine, since an always-sync device will not
+		 * have an impact on correctness.
+		 */
+		(void) blockif_set_wce(bctxt, 1);
+#endif
+
+		/*
+		 * Create an identifier for the backing file.
+		 * Use parts of the md5 sum of the filename
+		 */
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, opts, strlen(opts));
+		MD5Final(digest, &mdctx);
+		snprintf(sc->port[p].ident, AHCI_PORT_IDENT,
+		    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+		    digest[0], digest[1], digest[2], digest[3], digest[4],
+		    digest[5]);
+
+		/*
+		 * Allocate blockif request structures and add them
+		 * to the free list
+		 */
+		pci_ahci_ioreq_init(&sc->port[p]);
+
+		sc->pi |= (1 << p);
+		if (sc->port[p].ioqsz < slots)
+			slots = sc->port[p].ioqsz;
+	}
+	sc->ports = p;
+
+	/* Intel ICH8 AHCI */
+	--slots;
+	if (sc->ports < DEF_PORTS)
+		sc->ports = DEF_PORTS;
+	sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
+	    AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
+	    AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)|
+	    AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
+	    (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);
+
+	sc->vs = 0x10300;
+	sc->cap2 = AHCI_CAP2_APST;
+	ahci_reset(sc);
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
+	pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
+	/* MSI vector count: port count (at most 16) rounded up to 2^n. */
+	p = MIN(sc->ports, 16);
+	p = flsl(p) - ((p & (p - 1)) ? 0 : 1);
+	pci_emul_add_msicap(pi, 1 << p);
+	pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
+	    AHCI_OFFSET + sc->ports * AHCI_STEP);
+
+	pci_lintr_request(pi);
+
+open_fail:
+	if (ret) {
+		for (p = 0; p < sc->ports; p++) {
+			if (sc->port[p].bctx != NULL)
+				blockif_close(sc->port[p].bctx);
+			/* Request pool from pci_ahci_ioreq_init(), if any. */
+			free(sc->port[p].ioreq);
+		}
+		pthread_mutex_destroy(&sc->mtx);
+		pi->pi_arg = NULL;
+		free(sc);
+	}
+
+	return (ret);
+}
+
+/*
+ * Init hook for the disk flavours of the device: ports default to ATA
+ * (non-ATAPI) unless the option string says otherwise.
+ */
+static int
+pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	const int default_atapi = 0;
+
+	return (pci_ahci_init(ctx, pi, opts, default_atapi));
+}
+
+/*
+ * Init hook for the CD flavour of the device: ports default to ATAPI
+ * unless the option string says otherwise.
+ */
+static int
+pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	const int default_atapi = 1;
+
+	return (pci_ahci_init(ctx, pi, opts, default_atapi));
+}
+
+/*
+ * The same emulation is registered under three names.  "ahci" and
+ * "ahci-hd" default their ports to disks, "ahci-cd" defaults them to
+ * ATAPI devices; all three share the BAR read and write handlers.
+ */
+struct pci_devemu pci_de_ahci = {
+	.pe_emu		= "ahci",
+	.pe_init	= pci_ahci_hd_init,
+	.pe_barread	= pci_ahci_read,
+	.pe_barwrite	= pci_ahci_write,
+};
+PCI_EMUL_SET(pci_de_ahci);
+
+struct pci_devemu pci_de_ahci_hd = {
+	.pe_emu		= "ahci-hd",
+	.pe_init	= pci_ahci_hd_init,
+	.pe_barread	= pci_ahci_read,
+	.pe_barwrite	= pci_ahci_write,
+};
+PCI_EMUL_SET(pci_de_ahci_hd);
+
+struct pci_devemu pci_de_ahci_cd = {
+	.pe_emu		= "ahci-cd",
+	.pe_init	= pci_ahci_atapi_init,
+	.pe_barread	= pci_ahci_read,
+	.pe_barwrite	= pci_ahci_write,
+};
+PCI_EMUL_SET(pci_de_ahci_cd);
diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c
new file mode 100644
index 0000000000..e211b5cf9c
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_e82545.c
@@ -0,0 +1,2418 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
+ * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
+ * Copyright (c) 2013 Jeremiah Lott, Avere Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/limits.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#ifndef __FreeBSD__
+#include <sys/filio.h>
+#endif
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <md5.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "e1000_regs.h"
+#include "e1000_defines.h"
+#include "mii.h"
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+
+/* Hardware/register definitions XXX: move some to common code. */
+#define E82545_VENDOR_ID_INTEL 0x8086
+#define E82545_DEV_ID_82545EM_COPPER 0x100F
+#define E82545_SUBDEV_ID 0x1008
+
+#define E82545_REVISION_4 4
+
+#define E82545_MDIC_DATA_MASK 0x0000FFFF
+#define E82545_MDIC_OP_MASK 0x0c000000
+#define E82545_MDIC_IE 0x20000000
+
+#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */
+#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */
+#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */
+
+/* PCI BAR indices and sizes. */
+#define E82545_BAR_REGISTER 0
+#define E82545_BAR_REGISTER_LEN (128*1024)
+#define E82545_BAR_FLASH 1
+#define E82545_BAR_FLASH_LEN (64*1024)
+#define E82545_BAR_IO 2
+#define E82545_BAR_IO_LEN 8
+
+#define E82545_IOADDR 0x00000000
+#define E82545_IODATA 0x00000004
+#define E82545_IO_REGISTER_MAX 0x0001FFFF
+#define E82545_IO_FLASH_BASE 0x00080000
+#define E82545_IO_FLASH_MAX 0x000FFFFF
+
+/*
+ * Offset of 32-bit entry 'offset' in the register array that starts at
+ * 'reg'.  Both arguments are parenthesized so that compound expressions
+ * (e.g. "i + 1") scale correctly.
+ */
+#define E82545_ARRAY_ENTRY(reg, offset) ((reg) + ((offset) << 2))
+#define E82545_RAR_MAX 15
+#define E82545_MTA_MAX 127
+#define E82545_VFTA_MAX 127
+
+/* Slightly modified from the driver versions, hardcoded for 3 opcode bits,
+ * followed by 6 address bits.
+ * TODO: make opcode bits and addr bits configurable?
+ * NVM Commands - Microwire */
+#define E82545_NVM_OPCODE_BITS 3
+#define E82545_NVM_ADDR_BITS 6
+#define E82545_NVM_DATA_BITS 16
+#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS)
+#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1)
+#define E82545_NVM_OPCODE_MASK \
+ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS)
+/* Opcodes are pre-shifted above the address bits for direct comparison. */
+#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */
+#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */
+#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */
+#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */
+
+#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128 bytes */
+
+#define E1000_ICR_SRPD 0x00010000
+
+/* This is an arbitrary number. There is no hard limit on the chip. */
+#define I82545_MAX_TXSEGS 64
+
+/*
+ * Legacy receive descriptor.
+ * The fields add up to 16 bytes, which matches the "RDLEN / 16" ring
+ * sizing used by the receive path.
+ */
+struct e1000_rx_desc {
+ uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ uint16_t length; /* Length of data DMAed into data buffer */
+ uint16_t csum; /* Packet checksum */
+ uint8_t status; /* Descriptor status */
+ uint8_t errors; /* Descriptor Errors */
+ uint16_t special;
+};
+
+/*
+ * Transmit descriptor types: legacy (L), context (C) and data (D).
+ * These are the values e82545_txdesc_type() yields from the lower
+ * descriptor dword.
+ */
+#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000)
+#define E1000_TXD_TYP_L (0)
+#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C)
+#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)
+
+/*
+ * Legacy transmit descriptor.
+ * Each union offers whole-dword access (.data) alongside the individual
+ * sub-fields occupying the same bytes.
+ */
+struct e1000_tx_desc {
+ uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ union {
+ uint32_t data;
+ struct {
+ uint16_t length; /* Data buffer length */
+ uint8_t cso; /* Checksum offset */
+ uint8_t cmd; /* Descriptor control */
+ } flags;
+ } lower;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t css; /* Checksum start */
+ uint16_t special;
+ } fields;
+ } upper;
+};
+
+/*
+ * Context descriptor.
+ * Carries the IP and TCP/UDP checksum offsets plus segmentation setup;
+ * the transmit path keeps a copy of the most recent one in the softc
+ * (esc_txctx).
+ */
+struct e1000_context_desc {
+ union {
+ uint32_t ip_config;
+ struct {
+ uint8_t ipcss; /* IP checksum start */
+ uint8_t ipcso; /* IP checksum offset */
+ uint16_t ipcse; /* IP checksum end */
+ } ip_fields;
+ } lower_setup;
+ union {
+ uint32_t tcp_config;
+ struct {
+ uint8_t tucss; /* TCP checksum start */
+ uint8_t tucso; /* TCP checksum offset */
+ uint16_t tucse; /* TCP checksum end */
+ } tcp_fields;
+ } upper_setup;
+ uint32_t cmd_and_length;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t hdr_len; /* Header length */
+ uint16_t mss; /* Maximum segment size */
+ } fields;
+ } tcp_seg_setup;
+};
+
+/*
+ * Data descriptor (extended transmit descriptor carrying payload).
+ * The transmit path takes the buffer length from the low 20 bits of
+ * lower.data and the checksum-insertion requests from upper.fields.popts.
+ */
+struct e1000_data_desc {
+ uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+ union {
+ uint32_t data;
+ struct {
+ uint16_t length; /* Data buffer length */
+ uint8_t typ_len_ext;
+ uint8_t cmd;
+ } flags;
+ } lower;
+ union {
+ uint32_t data;
+ struct {
+ uint8_t status; /* Descriptor status */
+ uint8_t popts; /* Packet Options */
+ uint16_t special;
+ } fields;
+ } upper;
+};
+
+/*
+ * One transmit ring slot viewed as any of the three descriptor layouts.
+ * The type is determined at run time from td.lower.data.
+ */
+union e1000_tx_udesc {
+ struct e1000_tx_desc td;
+ struct e1000_context_desc cd;
+ struct e1000_data_desc dd;
+};
+
+/* Tx checksum info for a packet. */
+struct ck_info {
+ int ck_valid; /* ck_info is valid */
+ uint8_t ck_start; /* start byte of cksum calculation */
+ uint8_t ck_off; /* offset of cksum insertion */
+ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */
+};
+
+/*
+ * Debug printf
+ *
+ * DPRINTF() only prints when e82545_debug is non-zero; WPRINTF() always
+ * prints.  Both require at least one argument after the format string.
+ * DPRINTF() is wrapped in do { } while (0) so that it behaves as a single
+ * statement and cannot capture the "else" of an enclosing unbraced "if".
+ */
+static int e82545_debug = 0;
+#define DPRINTF(msg,params...) do {					\
+	if (e82545_debug)						\
+		fprintf(stderr, "e82545: " msg, params);		\
+} while (0)
+#define WPRINTF(msg,params...) fprintf(stderr, "e82545: " msg, params)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#define MAX(a,b) (((a)>(b))?(a):(b))
+
+/*
+ * s/w representation of the RAL/RAH regs
+ * NOTE(review): eu_valid / eu_addrsel presumably mirror the RAH "address
+ * valid" and "address select" bits -- confirm against the register
+ * write path, which is not part of this excerpt.
+ */
+struct eth_uni {
+ int eu_valid;
+ int eu_addrsel;
+ struct ether_addr eu_eth;
+};
+
+
+/*
+ * Per-device state of one emulated 82545 NIC: the PCI instance and VM
+ * context it belongs to, the tap backend descriptor, shadow copies of the
+ * guest-visible registers (the xNNNN in each comment is the register
+ * offset), cached host pointers to the guest descriptor rings, EEPROM
+ * (Microwire) state and statistics counters.  esc_mtx is held by the
+ * interrupt-throttle and tap callbacks while they touch this state.
+ */
+struct e82545_softc {
+ struct pci_devinst *esc_pi;
+ struct vmctx *esc_ctx;
+ struct mevent *esc_mevp;
+ struct mevent *esc_mevpitr;
+ pthread_mutex_t esc_mtx;
+ struct ether_addr esc_mac;
+ int esc_tapfd;
+
+ /* General */
+ uint32_t esc_CTRL; /* x0000 device ctl */
+ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */
+ uint32_t esc_FCAH; /* x002C flow ctl addr hi */
+ uint32_t esc_FCT; /* x0030 flow ctl type */
+ uint32_t esc_VET; /* x0038 VLAN eth type */
+ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */
+ uint32_t esc_LEDCTL; /* x0E00 LED control */
+ uint32_t esc_PBA; /* x1000 pkt buffer allocation */
+
+ /* Interrupt control */
+ int esc_irq_asserted;
+ uint32_t esc_ICR; /* x00C0 cause read/clear */
+ uint32_t esc_ITR; /* x00C4 intr throttling */
+ uint32_t esc_ICS; /* x00C8 cause set */
+ uint32_t esc_IMS; /* x00D0 mask set/read */
+ uint32_t esc_IMC; /* x00D8 mask clear */
+
+ /* Transmit */
+ union e1000_tx_udesc *esc_txdesc;
+ struct e1000_context_desc esc_txctx;
+ pthread_t esc_tx_tid;
+ pthread_cond_t esc_tx_cond;
+ int esc_tx_enabled;
+ int esc_tx_active;
+ uint32_t esc_TXCW; /* x0178 transmit config */
+ uint32_t esc_TCTL; /* x0400 transmit ctl */
+ uint32_t esc_TIPG; /* x0410 inter-packet gap */
+ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */
+ uint64_t esc_tdba; /* verified 64-bit desc table addr */
+ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */
+ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */
+ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */
+ uint16_t esc_TDH; /* x3810 desc table head idx */
+ uint16_t esc_TDHr; /* internal read version of TDH */
+ uint16_t esc_TDT; /* x3818 desc table tail idx */
+ uint32_t esc_TIDV; /* x3820 intr delay */
+ uint32_t esc_TXDCTL; /* x3828 desc control */
+ uint32_t esc_TADV; /* x382C intr absolute delay */
+
+ /* L2 frame acceptance */
+ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */
+ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */
+ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */
+
+ /* Receive */
+ struct e1000_rx_desc *esc_rxdesc;
+ pthread_cond_t esc_rx_cond;
+ int esc_rx_enabled;
+ int esc_rx_active;
+ int esc_rx_loopback;
+ uint32_t esc_RCTL; /* x0100 receive ctl */
+ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */
+ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */
+ uint64_t esc_rdba; /* verified 64-bit desc table addr */
+ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */
+ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/
+ uint32_t esc_RDLEN; /* x2808 #descriptors */
+ uint16_t esc_RDH; /* x2810 desc table head idx */
+ uint16_t esc_RDT; /* x2818 desc table tail idx */
+ uint32_t esc_RDTR; /* x2820 intr delay */
+ uint32_t esc_RXDCTL; /* x2828 desc control */
+ uint32_t esc_RADV; /* x282C intr absolute delay */
+ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */
+ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */
+
+ /* IO Port register access */
+ uint32_t io_addr;
+
+ /* Shadow copy of MDIC */
+ uint32_t mdi_control;
+ /* Shadow copy of EECD */
+ uint32_t eeprom_control;
+ /* Latest NVM in/out */
+ uint16_t nvm_data;
+ uint16_t nvm_opaddr;
+ /* stats */
+ uint32_t missed_pkt_count; /* dropped for no room in rx queue */
+ uint32_t pkt_rx_by_size[6];
+ uint32_t pkt_tx_by_size[6];
+ uint32_t good_pkt_rx_count;
+ uint32_t bcast_pkt_rx_count;
+ uint32_t mcast_pkt_rx_count;
+ uint32_t good_pkt_tx_count;
+ uint32_t bcast_pkt_tx_count;
+ uint32_t mcast_pkt_tx_count;
+ uint32_t oversize_rx_count;
+ uint32_t tso_tx_count;
+ uint64_t good_octets_rx;
+ uint64_t good_octets_tx;
+ uint64_t missed_octets; /* counts missed and oversized */
+
+ /* Microwire shift state: bits left in the current phase, and the phase */
+ uint8_t nvm_bits:6; /* number of bits remaining in/out */
+ uint8_t nvm_mode:2;
+#define E82545_NVM_MODE_OPADDR 0x0
+#define E82545_NVM_MODE_DATAIN 0x1
+#define E82545_NVM_MODE_DATAOUT 0x2
+ /* EEPROM data */
+ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE];
+};
+
+/*
+ * Forward declarations for routines that are used before they are defined.
+ * The tap callback only exists in the FreeBSD build.
+ */
+static void e82545_reset(struct e82545_softc *sc, int dev);
+static void e82545_rx_enable(struct e82545_softc *sc);
+static void e82545_rx_disable(struct e82545_softc *sc);
+#ifdef __FreeBSD__
+static void e82545_tap_callback(int fd, enum ev_type type, void *param);
+#endif
+static void e82545_tx_start(struct e82545_softc *sc);
+static void e82545_tx_enable(struct e82545_softc *sc);
+static void e82545_tx_disable(struct e82545_softc *sc);
+
+/*
+ * Map a packet size to its slot in the 6-entry by-size statistics arrays
+ * (pkt_rx_by_size / pkt_tx_by_size):
+ *
+ *	0: <= 64    1: 65-127    2: 128-255
+ *	3: 256-511  4: 512-1023  5: >= 1024
+ *
+ * The bucket is selected by the highest power of two not exceeding the
+ * size.  (Using ffs(), i.e. the lowest set bit, gave wrong and even
+ * negative indices for sizes that are not powers of two.)
+ */
+static inline int
+e82545_size_stat_index(uint32_t size)
+{
+	uint32_t limit;
+	int idx;
+
+	if (size <= 64)
+		return (0);
+	if (size >= 1024)
+		return (5);
+
+	/* 65..1023: one bucket per power of two, starting at 128. */
+	idx = 1;
+	for (limit = 128; size >= limit; limit <<= 1)
+		idx++;
+	return (idx);
+}
+
+/*
+ * Populate the emulated EEPROM image: the MAC address as three 16-bit
+ * words (lower-numbered octet in the low byte), the PCI
+ * device/vendor/subsystem IDs, and a checksum word chosen so that words
+ * 0..NVM_CHECKSUM_REG add up to NVM_SUM.
+ */
+static void
+e82545_init_eeprom(struct e82545_softc *sc)
+{
+	uint16_t sum;
+	int w;
+
+	/* mac addr */
+	for (w = 0; w < 3; w++) {
+		sc->eeprom_data[NVM_MAC_ADDR + w] =
+		    ((uint16_t)sc->esc_mac.octet[2 * w]) |
+		    (((uint16_t)sc->esc_mac.octet[2 * w + 1]) << 8);
+	}
+
+	/* pci ids */
+	sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL;
+	sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER;
+	sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL;
+	sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID;
+
+	/* fill in the checksum */
+	sum = 0;
+	w = 0;
+	while (w < NVM_CHECKSUM_REG) {
+		sum += sc->eeprom_data[w];
+		w++;
+	}
+	sum = NVM_SUM - sum;
+	sc->eeprom_data[NVM_CHECKSUM_REG] = sum;
+	DPRINTF("eeprom checksum: 0x%x\r\n", sum);
+}
+
+/*
+ * MDI (PHY register) write.  No PHY state is kept for writes; the access
+ * is only traced when debugging is enabled.
+ */
+static void
+e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr,
+    uint8_t phy_addr, uint32_t data)
+{
+
+	DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n",
+	    reg_addr, phy_addr, data);
+}
+
+/*
+ * MDI (PHY register) read.  Returns fixed values for the handful of PHY
+ * registers that are modelled (status, autoneg advertisement, link
+ * partner ability, 1000T status, PHY ID 1/2); every other register reads
+ * as 0 and is traced when debugging is enabled.
+ */
+static uint32_t
+e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr,
+    uint8_t phy_addr)
+{
+	uint32_t val;
+
+	switch (reg_addr) {
+	case PHY_STATUS:
+		val = MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS |
+		    MII_SR_AUTONEG_COMPLETE;
+		break;
+	case PHY_AUTONEG_ADV:
+		val = NWAY_AR_SELECTOR_FIELD;
+		break;
+	case PHY_LP_ABILITY:
+		val = 0;
+		break;
+	case PHY_1000T_STATUS:
+		val = SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS |
+		    SR_1000T_LOCAL_RX_STATUS;
+		break;
+	case PHY_ID1:
+		val = (M88E1011_I_PHY_ID >> 16) & 0xFFFF;
+		break;
+	case PHY_ID2:
+		val = (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF;
+		break;
+	default:
+		DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr);
+		val = 0;
+		break;
+	}
+
+	return (val);
+}
+
+/*
+ * Advance the Microwire EEPROM state machine by one bit.
+ *
+ * Every call consumes one of the sc->nvm_bits remaining in the current
+ * phase (sc->nvm_mode):
+ *
+ *   OPADDR   shift the DI bit of the EECD shadow into nvm_opaddr.  After
+ *            E82545_NVM_OPADDR_BITS bits the opcode is decoded: READ loads
+ *            nvm_data from the EEPROM image and switches to DATAOUT, WRITE
+ *            switches to DATAIN, everything else (including write-enable)
+ *            restarts opcode collection.
+ *   DATAOUT  present the top bit of nvm_data on the DO bit of the EECD
+ *            shadow and shift nvm_data left; after the last bit return to
+ *            OPADDR.
+ *   DATAIN   shift the DI bit into nvm_data; after the last bit store the
+ *            word into the EEPROM image (if the opcode really was WRITE
+ *            and the address is in range) and return to OPADDR.
+ *
+ * A strobe while no bits are expected is logged and ignored.
+ * NOTE(review): presumably called from the EECD register write path on a
+ * clock edge -- the caller is not part of this excerpt.
+ */
+static void
+e82545_eecd_strobe(struct e82545_softc *sc)
+{
+	if (sc->nvm_bits == 0) {
+		DPRINTF("eeprom state machine not expecting data! "
+		    "0x%x 0x%x 0x%x 0x%x\r\n",
+		    sc->nvm_mode, sc->nvm_bits,
+		    sc->nvm_opaddr, sc->nvm_data);
+		return;
+	}
+	sc->nvm_bits--;
+	if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) {
+		/* shifting out */
+		if (sc->nvm_data & 0x8000) {
+			sc->eeprom_control |= E1000_EECD_DO;
+		} else {
+			sc->eeprom_control &= ~E1000_EECD_DO;
+		}
+		sc->nvm_data <<= 1;
+		if (sc->nvm_bits == 0) {
+			/* read done, back to opcode mode. */
+			sc->nvm_opaddr = 0;
+			sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+			sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+		}
+	} else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) {
+		/* shifting in */
+		sc->nvm_data <<= 1;
+		if (sc->eeprom_control & E1000_EECD_DI) {
+			sc->nvm_data |= 1;
+		}
+		if (sc->nvm_bits == 0) {
+			/* eeprom write */
+			uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
+			uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK;
+			if (op != E82545_NVM_OPCODE_WRITE) {
+				DPRINTF("Illegal eeprom write op 0x%x\r\n",
+				    sc->nvm_opaddr);
+			} else if (addr >= E82545_NVM_EEPROM_SIZE) {
+				DPRINTF("Illegal eeprom write addr 0x%x\r\n",
+				    sc->nvm_opaddr);
+			} else {
+				DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n",
+				    addr, sc->nvm_data);
+				sc->eeprom_data[addr] = sc->nvm_data;
+			}
+			/* back to opcode mode */
+			sc->nvm_opaddr = 0;
+			sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+			sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+		}
+	} else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) {
+		sc->nvm_opaddr <<= 1;
+		if (sc->eeprom_control & E1000_EECD_DI) {
+			sc->nvm_opaddr |= 1;
+		}
+		if (sc->nvm_bits == 0) {
+			uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
+			switch (op) {
+			case E82545_NVM_OPCODE_EWEN:
+				DPRINTF("eeprom write enable: 0x%x\r\n",
+				    sc->nvm_opaddr);
+				/* back to opcode mode */
+				sc->nvm_opaddr = 0;
+				sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+				sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+				break;
+			case E82545_NVM_OPCODE_READ:
+			{
+				uint16_t addr = sc->nvm_opaddr &
+				    E82545_NVM_ADDR_MASK;
+				sc->nvm_mode = E82545_NVM_MODE_DATAOUT;
+				sc->nvm_bits = E82545_NVM_DATA_BITS;
+				if (addr < E82545_NVM_EEPROM_SIZE) {
+					sc->nvm_data = sc->eeprom_data[addr];
+					DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n",
+					    addr, sc->nvm_data);
+				} else {
+					DPRINTF("eeprom illegal read: 0x%x\r\n",
+					    sc->nvm_opaddr);
+					sc->nvm_data = 0;
+				}
+				break;
+			}
+			case E82545_NVM_OPCODE_WRITE:
+				sc->nvm_mode = E82545_NVM_MODE_DATAIN;
+				sc->nvm_bits = E82545_NVM_DATA_BITS;
+				sc->nvm_data = 0;
+				break;
+			default:
+				/* was "\r\r": terminate like every other message */
+				DPRINTF("eeprom unknown op: 0x%x\r\n",
+				    sc->nvm_opaddr);
+				/* back to opcode mode */
+				sc->nvm_opaddr = 0;
+				sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+				sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+			}
+		}
+	} else {
+		DPRINTF("eeprom state machine wrong state! "
+		    "0x%x 0x%x 0x%x 0x%x\r\n",
+		    sc->nvm_mode, sc->nvm_bits,
+		    sc->nvm_opaddr, sc->nvm_data);
+	}
+}
+
+#ifdef __FreeBSD__
+/*
+ * Interrupt-throttle timer callback (param is the softc; fd and type are
+ * unused).  Under esc_mtx: if an unmasked cause is pending and the line is
+ * not currently asserted, assert it; otherwise delete the timer event and
+ * clear esc_mevpitr.
+ */
+static void
+e82545_itr_callback(int fd, enum ev_type type, void *param)
+{
+	struct e82545_softc *sc = param;
+	uint32_t pending;
+
+	pthread_mutex_lock(&sc->esc_mtx);
+	pending = sc->esc_ICR & sc->esc_IMS;
+	if (pending == 0 || sc->esc_irq_asserted) {
+		mevent_delete(sc->esc_mevpitr);
+		sc->esc_mevpitr = NULL;
+	} else {
+		DPRINTF("itr callback: lintr assert %x\r\n", pending);
+		sc->esc_irq_asserted = 1;
+		pci_lintr_assert(sc->esc_pi);
+	}
+	pthread_mutex_unlock(&sc->esc_mtx);
+}
+#endif
+
+/*
+ * Record interrupt cause bits in ICR and raise the legacy interrupt if
+ * appropriate.
+ *
+ * The line is asserted only when at least one of the bits is newly set
+ * and unmasked, no throttle timer is outstanding, and the line is not
+ * already asserted.  With a non-zero ITR the FreeBSD build then arms the
+ * throttle timer.
+ */
+static void
+e82545_icr_assert(struct e82545_softc *sc, uint32_t bits)
+{
+	uint32_t fresh;
+
+	DPRINTF("icr assert: 0x%x\r\n", bits);
+
+	/* Causes that were neither set before nor are masked off. */
+	fresh = bits & ~sc->esc_ICR & sc->esc_IMS;
+	sc->esc_ICR |= bits;
+
+	if (fresh == 0) {
+		DPRINTF("icr assert: masked %x, ims %x\r\n", fresh, sc->esc_IMS);
+		return;
+	}
+	if (sc->esc_mevpitr != NULL) {
+		DPRINTF("icr assert: throttled %x, ims %x\r\n", fresh, sc->esc_IMS);
+		return;
+	}
+	if (sc->esc_irq_asserted) {
+		return;
+	}
+
+	DPRINTF("icr assert: lintr assert %x\r\n", fresh);
+	sc->esc_irq_asserted = 1;
+	pci_lintr_assert(sc->esc_pi);
+#ifdef __FreeBSD__
+	if (sc->esc_ITR != 0) {
+		sc->esc_mevpitr = mevent_add(
+		    (sc->esc_ITR + 3905) / 3906,  /* 256ns -> 1ms */
+		    EVF_TIMER, e82545_itr_callback, sc);
+	}
+#endif
+}
+
+/*
+ * Enable (unmask) the given interrupt causes.
+ *
+ * Causes that are already latched in ICR and become unmasked by this
+ * call may raise the interrupt, subject to the same throttle and
+ * already-asserted checks as a fresh cause.
+ */
+static void
+e82545_ims_change(struct e82545_softc *sc, uint32_t bits)
+{
+	uint32_t unmasked;
+
+	/* Latched causes that this write newly enables. */
+	unmasked = bits & sc->esc_ICR & ~sc->esc_IMS;
+	sc->esc_IMS |= bits;
+
+	if (unmasked == 0) {
+		DPRINTF("ims change: masked %x, ims %x\r\n", unmasked, sc->esc_IMS);
+		return;
+	}
+	if (sc->esc_mevpitr != NULL) {
+		DPRINTF("ims change: throttled %x, ims %x\r\n", unmasked, sc->esc_IMS);
+		return;
+	}
+	if (sc->esc_irq_asserted) {
+		return;
+	}
+
+	DPRINTF("ims change: lintr assert %x\n\r", unmasked);
+	sc->esc_irq_asserted = 1;
+	pci_lintr_assert(sc->esc_pi);
+#ifdef __FreeBSD__
+	if (sc->esc_ITR != 0) {
+		sc->esc_mevpitr = mevent_add(
+		    (sc->esc_ITR + 3905) / 3906,  /* 256ns -> 1ms */
+		    EVF_TIMER, e82545_itr_callback, sc);
+	}
+#endif
+}
+
+/*
+ * Clear the given cause bits from ICR and, if the interrupt line was
+ * asserted and no unmasked cause remains, deassert it.
+ */
+static void
+e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits)
+{
+
+	DPRINTF("icr deassert: 0x%x\r\n", bits);
+	sc->esc_ICR &= ~bits;
+
+	if (!sc->esc_irq_asserted) {
+		return;
+	}
+	if ((sc->esc_ICR & sc->esc_IMS) != 0) {
+		/* Some enabled interrupt source is still pending. */
+		return;
+	}
+
+	DPRINTF("icr deassert: lintr deassert %x\r\n", bits);
+	pci_lintr_deassert(sc->esc_pi);
+	sc->esc_irq_asserted = 0;
+}
+
+/*
+ * Guest write to one of the interrupt registers:
+ *   ICR  clears the written cause bits
+ *   ITR  stores the throttle value
+ *   ICS  sets the written cause bits (may raise the interrupt)
+ *   IMS  enables the written causes (may raise the interrupt)
+ *   IMC  disables the written causes
+ * Any other offset is ignored.
+ */
+static void
+e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value)
+{
+
+	DPRINTF("intr_write: off %x, val %x\n\r", offset, value);
+
+	if (offset == E1000_ICR) {
+		e82545_icr_deassert(sc, value);
+	} else if (offset == E1000_ITR) {
+		sc->esc_ITR = value;
+	} else if (offset == E1000_ICS) {
+		sc->esc_ICS = value;	/* not used: store for debug */
+		e82545_icr_assert(sc, value);
+	} else if (offset == E1000_IMS) {
+		e82545_ims_change(sc, value);
+	} else if (offset == E1000_IMC) {
+		sc->esc_IMC = value;	/* for debug */
+		sc->esc_IMS &= ~value;
+		/*
+		 * XXX clear interrupts if all ICR bits now masked
+		 * and interrupt was pending ?
+		 */
+	}
+}
+
+/*
+ * Guest read of one of the interrupt registers.  Reading ICR returns the
+ * latched causes and clears them all (deasserting the line if it was
+ * asserted); ITR and IMS return their stored values; the write-only ICS
+ * and IMC, and any other offset, read as 0.
+ */
+static uint32_t
+e82545_intr_read(struct e82545_softc *sc, uint32_t offset)
+{
+	uint32_t val = 0;
+
+	DPRINTF("intr_read: off %x\n\r", offset);
+
+	if (offset == E1000_ICR) {
+		/* read-to-clear */
+		val = sc->esc_ICR;
+		sc->esc_ICR = 0;
+		e82545_icr_deassert(sc, ~0);
+	} else if (offset == E1000_ITR) {
+		val = sc->esc_ITR;
+	} else if (offset == E1000_IMS) {
+		val = sc->esc_IMS;
+	}
+	/* E1000_ICS and E1000_IMC are write-only registers: val stays 0. */
+
+	return (val);
+}
+
+/*
+ * Guest write to the device control register.  The value is stored with
+ * the reset bit stripped; if the reset bit was set a software reset is
+ * performed via e82545_reset(sc, 1).
+ */
+static void
+e82545_devctl(struct e82545_softc *sc, uint32_t val)
+{
+	const uint32_t reset_req = val & E1000_CTRL_RST;
+
+	sc->esc_CTRL = val & ~E1000_CTRL_RST;
+
+	/* XXX check for phy reset ? */
+	if (!reset_req)
+		return;
+
+	DPRINTF("e1k: s/w reset, ctl %x\n", val);
+	e82545_reset(sc, 1);
+}
+
+/*
+ * Recompute the 64-bit receive ring base from RDBAH:RDBAL and cache the
+ * host mapping of the RDLEN-byte guest descriptor array.
+ */
+static void
+e82545_rx_update_rdba(struct e82545_softc *sc)
+{
+	uint64_t hi, lo;
+
+	/* XXX verify desc base/len within phys mem range */
+	hi = sc->esc_RDBAH;
+	lo = sc->esc_RDBAL;
+	sc->esc_rdba = (hi << 32) | lo;
+
+	/* Cache host mapping of guest descriptor array */
+	sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba,
+	    sc->esc_RDLEN);
+}
+
+/*
+ * Guest write to the receive control register.  The value is stored with
+ * the reserved bits stripped.  When the enable bit changes state the
+ * receiver is started (after noting loopback mode and re-mapping the
+ * descriptor ring) or stopped (dropping the cached ring mapping).
+ */
+static void
+e82545_rx_ctl(struct e82545_softc *sc, uint32_t val)
+{
+	const int enable = ((val & E1000_RCTL_EN) == E1000_RCTL_EN);
+
+	/* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */
+	sc->esc_RCTL = val & ~0xF9204c01;
+
+	DPRINTF("rx_ctl - %s RCTL %x, val %x\n",
+	    enable ? "on" : "off", sc->esc_RCTL, val);
+
+	/* Nothing more to do unless a state change is requested. */
+	if (enable == sc->esc_rx_enabled)
+		return;
+
+	if (!enable) {
+		e82545_rx_disable(sc);
+		sc->esc_rx_loopback = 0;
+		sc->esc_rdba = 0;
+		sc->esc_rxdesc = NULL;
+		return;
+	}
+
+	/* Transceiver loopback is tracked rather than rejected. */
+	sc->esc_rx_loopback = (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) ? 1 : 0;
+
+	e82545_rx_update_rdba(sc);
+	e82545_rx_enable(sc);
+}
+
+/*
+ * Recompute the 64-bit transmit ring base from TDBAH:TDBAL and cache the
+ * host mapping of the TDLEN-byte guest descriptor array.
+ */
+static void
+e82545_tx_update_tdba(struct e82545_softc *sc)
+{
+	uint64_t hi, lo;
+
+	/* XXX verify desc base/len within phys mem range */
+	hi = sc->esc_TDBAH;
+	lo = sc->esc_TDBAL;
+	sc->esc_tdba = (hi << 32) | lo;
+
+	/* Cache host mapping of guest descriptor array */
+	sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba,
+	    sc->esc_TDLEN);
+}
+
+/*
+ * Guest write to the transmit control register.  Writes that do not
+ * change the enable bit are ignored entirely (TCTL is not updated).
+ * Otherwise the transmitter is started (after re-mapping the descriptor
+ * ring) or stopped, and the value is stored with reserved bits stripped.
+ */
+static void
+e82545_tx_ctl(struct e82545_softc *sc, uint32_t val)
+{
+	const int enable = ((val & E1000_TCTL_EN) == E1000_TCTL_EN);
+
+	/* ignore TCTL_EN settings that don't change state */
+	if (enable == sc->esc_tx_enabled)
+		return;
+
+	if (!enable) {
+		e82545_tx_disable(sc);
+		sc->esc_tdba = 0;
+		sc->esc_txdesc = NULL;
+	} else {
+		e82545_tx_update_tdba(sc);
+		e82545_tx_enable(sc);
+	}
+
+	/* Save TCTL value after stripping reserved bits 31:25,23,2,0 */
+	sc->esc_TCTL = val & ~0xFE800005;
+}
+
+/*
+ * Translate the receive buffer size selection in RCTL (the size field
+ * together with the BSEX extension bit) into a byte count.  Any
+ * combination not listed yields 256.
+ */
+int
+e82545_bufsz(uint32_t rctl)
+{
+	const uint32_t sel = rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256);
+
+	if (sel == (E1000_RCTL_SZ_2048))
+		return (2048);
+	if (sel == (E1000_RCTL_SZ_1024))
+		return (1024);
+	if (sel == (E1000_RCTL_SZ_512))
+		return (512);
+	if (sel == (E1000_RCTL_SZ_256))
+		return (256);
+	if (sel == (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384))
+		return (16384);
+	if (sel == (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192))
+		return (8192);
+	if (sel == (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096))
+		return (4096);
+
+	return (256);	/* Forbidden value. */
+}
+
+#ifdef __FreeBSD__
+/* Scratch buffer used to drain and discard packets that cannot be accepted. */
+static uint8_t dummybuf[2048];
+
+/*
+ * Tap read-ready callback: move packets from the tap device into the
+ * guest receive ring.  param is the softc; fd and type are unused.
+ *
+ * Packets are drained and dropped when the receiver is disabled or in
+ * loopback, when the ring has no descriptors at all (RDLEN < 16), or when
+ * fewer free descriptors remain than a maximum-size packet may need.
+ * Otherwise at most size/4 descriptors' worth of packets are read with
+ * readv() straight into the guest buffers.  esc_mtx is dropped while
+ * copying, with esc_rx_active set, and retaken to publish the new head
+ * and raise the accumulated interrupt causes.
+ *
+ * XXX one packet at a time until this is debugged
+ */
+static void
+e82545_tap_callback(int fd, enum ev_type type, void *param)
+{
+	struct e82545_softc *sc = param;
+	struct e1000_rx_desc *rxd;
+	struct iovec vec[64];
+	int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size;
+	uint32_t cause = 0;
+	uint16_t *tp, tag, head;
+
+	pthread_mutex_lock(&sc->esc_mtx);
+	DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT);
+
+	if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
+		DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n",
+		    sc->esc_rx_enabled, sc->esc_rx_loopback);
+		while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+		}
+		goto done1;
+	}
+	bufsz = e82545_bufsz(sc->esc_RCTL);
+	maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522;
+	maxpktdesc = (maxpktsz + bufsz - 1) / bufsz;
+	size = sc->esc_RDLEN / 16;
+	if (size == 0) {
+		/*
+		 * The guest enabled receive with an empty ring.  Every
+		 * "% size" below would divide by zero, so just drop.
+		 */
+		DPRINTF("rx ring empty (RDLEN %u) -- packet(s) dropped\r\n",
+		    sc->esc_RDLEN);
+		while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+		}
+		goto done1;
+	}
+	head = sc->esc_RDH;
+	left = (size + sc->esc_RDT - head) % size;
+	if (left < maxpktdesc) {
+		DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n",
+		    left, maxpktdesc);
+		while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+		}
+		goto done1;
+	}
+
+	sc->esc_rx_active = 1;
+	pthread_mutex_unlock(&sc->esc_mtx);
+
+	for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) {
+
+		/* Grab rx descriptor pointed to by the head pointer */
+		for (i = 0; i < maxpktdesc; i++) {
+			rxd = &sc->esc_rxdesc[(head + i) % size];
+			vec[i].iov_base = paddr_guest2host(sc->esc_ctx,
+			    rxd->buffer_addr, bufsz);
+			vec[i].iov_len = bufsz;
+		}
+		len = readv(sc->esc_tapfd, vec, maxpktdesc);
+		if (len <= 0) {
+			DPRINTF("tap: readv() returned %d\n", len);
+			goto done;
+		}
+
+		/*
+		 * Adjust the packet length based on whether the CRC needs
+		 * to be stripped or if the packet is less than the minimum
+		 * eth packet size.
+		 */
+		if (len < ETHER_MIN_LEN - ETHER_CRC_LEN)
+			len = ETHER_MIN_LEN - ETHER_CRC_LEN;
+		if (!(sc->esc_RCTL & E1000_RCTL_SECRC))
+			len += ETHER_CRC_LEN;
+		n = (len + bufsz - 1) / bufsz;
+
+		DPRINTF("packet read %d bytes, %d segs, head %d\r\n",
+		    len, n, head);
+
+		/* Apply VLAN filter. */
+		tp = (uint16_t *)vec[0].iov_base + 6;
+		if ((sc->esc_RCTL & E1000_RCTL_VFE) &&
+		    (ntohs(tp[0]) == sc->esc_VET)) {
+			tag = ntohs(tp[1]) & 0x0fff;
+			if ((sc->esc_fvlan[tag >> 5] &
+			    (1 << (tag & 0x1f))) != 0) {
+				DPRINTF("known VLAN %d\r\n", tag);
+			} else {
+				DPRINTF("unknown VLAN %d\r\n", tag);
+				/* Filtered: no descriptors are consumed. */
+				n = 0;
+				continue;
+			}
+		}
+
+		/* Update all consumed descriptors. */
+		for (i = 0; i < n - 1; i++) {
+			rxd = &sc->esc_rxdesc[(head + i) % size];
+			rxd->length = bufsz;
+			rxd->csum = 0;
+			rxd->errors = 0;
+			rxd->special = 0;
+			rxd->status = E1000_RXD_STAT_DD;
+		}
+		rxd = &sc->esc_rxdesc[(head + i) % size];
+		/*
+		 * The last descriptor holds whatever the n - 1 full buffers
+		 * did not.  ("len % bufsz" reported 0 bytes when the packet
+		 * was an exact multiple of the buffer size.)
+		 */
+		rxd->length = len - (n - 1) * bufsz;
+		rxd->csum = 0;
+		rxd->errors = 0;
+		rxd->special = 0;
+		/* XXX signal no checksum for now */
+		rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM |
+		    E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD;
+
+		/* Schedule receive interrupts. */
+		if (len <= sc->esc_RSRPD) {
+			cause |= E1000_ICR_SRPD | E1000_ICR_RXT0;
+		} else {
+			/* XXX: RDRT and RADV timers should be here. */
+			cause |= E1000_ICR_RXT0;
+		}
+
+		head = (head + n) % size;
+		left -= n;
+	}
+
+done:
+	pthread_mutex_lock(&sc->esc_mtx);
+	sc->esc_rx_active = 0;
+	/* Wake anyone waiting for the receiver to go idle after a disable. */
+	if (sc->esc_rx_enabled == 0)
+		pthread_cond_signal(&sc->esc_rx_cond);
+
+	sc->esc_RDH = head;
+	/* Respect E1000_RCTL_RDMTS */
+	left = (size + sc->esc_RDT - head) % size;
+	if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1)))
+		cause |= E1000_ICR_RXDMT0;
+	/* Assert all accumulated interrupts. */
+	if (cause != 0)
+		e82545_icr_assert(sc, cause);
+done1:
+	DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT);
+	pthread_mutex_unlock(&sc->esc_mtx);
+}
+#endif
+
+/*
+ * Fold a 32-bit one's-complement accumulator into 16 bits: add the upper
+ * half to the lower half, and if that still exceeds 16 bits, subtract
+ * 0xFFFF (i.e. wrap the carry around).
+ */
+static uint16_t
+e82545_carry(uint32_t sum)
+{
+	uint32_t folded;
+
+	folded = (sum >> 16) + (sum & 0xFFFF);
+	return (folded > 0xFFFF ? folded - 0xFFFF : folded);
+}
+
+/*
+ * One's-complement sum (folded to 16 bits, not inverted) over len bytes
+ * at buf.  Byte pairs are summed as 16-bit words in memory order; a
+ * trailing odd byte is treated as if followed by a zero byte.  A
+ * non-positive len sums nothing and returns 0.
+ *
+ * The buffer is always accessed as unsigned bytes: in the caddr_t variant
+ * plain "char" may be signed, and left-shifting a negative value is
+ * undefined.  Words are loaded with memcpy() because buf + off need not be
+ * 16-bit aligned.
+ */
+static uint16_t
+#ifdef __FreeBSD__
+e82545_buf_checksum(uint8_t *buf, int len)
+#else
+e82545_buf_checksum(caddr_t buf, int len)
+#endif
+{
+	const uint8_t *p = (const uint8_t *)buf;
+	uint32_t sum = 0;
+	uint16_t word;
+	int i;
+
+	/* Checksum all the pairs of bytes first... */
+	for (i = 0; i + 1 < len; i += 2) {
+		memcpy(&word, p + i, sizeof (word));
+		sum += word;
+	}
+
+	/*
+	 * If there's a single byte left over, checksum it, too.
+	 * Network byte order is big-endian, so the remaining byte is
+	 * the high byte.
+	 */
+	if (i < len)
+		sum += htons((uint16_t)(p[i] << 8));
+
+	return (e82545_carry(sum));
+}
+
+/*
+ * One's-complement sum (folded, not inverted) over len bytes of a
+ * scatter/gather list, starting off bytes into it.  The range ends early
+ * if the vectors run out.
+ *
+ * Each vector is summed on its own.  When the bytes summed so far are odd
+ * in number, the next partial sum is shifted by 8 bits before being added,
+ * so that its bytes land in the opposite halves of the 16-bit word.
+ */
+static uint16_t
+e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len)
+{
+	uint32_t total = 0, part;
+	int chunk, swapped = 0;
+
+	/* Skip completely unneeded vectors. */
+	for (; iovcnt > 0 && iov->iov_len <= off && off > 0; iov++, iovcnt--)
+		off -= iov->iov_len;
+
+	/* Calculate checksum of requested range. */
+	for (; len > 0 && iovcnt > 0; iov++, iovcnt--) {
+		chunk = MIN(len, iov->iov_len - off);
+		part = e82545_buf_checksum(iov->iov_base + off, chunk);
+		total += swapped ? (part << 8) : part;
+		swapped ^= (chunk & 1);
+		len -= chunk;
+		/* Only the first vector used starts at an offset. */
+		off = 0;
+	}
+
+	return (e82545_carry(total));
+}
+
+/*
+ * Return the transmit descriptor type encoded in the lower descriptor
+ * dword: 0 (legacy) when the DEXT bit is clear, otherwise the bits
+ * selected by E1000_TXD_MASK.
+ */
+int
+e82545_txdesc_type(uint32_t lower)
+{
+
+	if ((lower & E1000_TXD_CMD_DEXT) == 0)
+		return (0);
+
+	return (lower & E1000_TXD_MASK);
+}
+
+/*
+ * Compute the checksum described by ck over the packet in iov[] and store
+ * its complement at byte offset ck_off of the first vector.  A ck_len of
+ * 0 means "to the end of the packet"; otherwise ck_len is the last byte
+ * covered (inclusive).
+ */
+static void
+e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck)
+{
+	uint8_t *slot;
+	uint16_t csum;
+	int span;
+
+	DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n",
+	    iovcnt, ck->ck_start, ck->ck_off, ck->ck_len);
+
+	if (ck->ck_len != 0)
+		span = ck->ck_len - ck->ck_start + 1;
+	else
+		span = INT_MAX;
+
+	csum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, span);
+	slot = (uint8_t *)iov[0].iov_base + ck->ck_off;
+	*(uint16_t *)slot = ~csum;
+}
+
+/*
+ * Hand a fully assembled packet to the tap backend.  Does nothing when no
+ * tap descriptor is open (esc_tapfd == -1); the result of writev() is
+ * deliberately ignored.
+ */
+static void
+e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt)
+{
+
+	if (sc->esc_tapfd != -1)
+		(void) writev(sc->esc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Mark processed transmit descriptors as done.  Walks the ring from head
+ * up to (not including) tail, wrapping at dsize; every descriptor that
+ * requested status reporting (RS) gets its DD status bit set, and *tdwb
+ * is set to 1 if any such write-back happened.
+ */
+static void
+e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail,
+    uint16_t dsize, int *tdwb)
+{
+	union e1000_tx_udesc *dsc;
+	uint16_t idx = head;
+
+	while (idx != tail) {
+		dsc = &sc->esc_txdesc[idx];
+		if ((dsc->td.lower.data & E1000_TXD_CMD_RS) != 0) {
+			dsc->td.upper.data |= E1000_TXD_STAT_DD;
+			*tdwb = 1;
+		}
+		idx = (idx + 1) % dsize;
+	}
+}
+
+static int
+e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
+ uint16_t dsize, uint16_t *rhead, int *tdwb)
+{
+#ifdef __FreeBSD__
+ uint8_t *hdr, *hdrp;
+#else
+ caddr_t hdr, hdrp;
+#endif
+ struct iovec iovb[I82545_MAX_TXSEGS + 2];
+ struct iovec tiov[I82545_MAX_TXSEGS + 2];
+ struct e1000_context_desc *cd;
+ struct ck_info ckinfo[2];
+ struct iovec *iov;
+ union e1000_tx_udesc *dsc;
+ int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso;
+ int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff;
+ uint32_t tcpsum, tcpseq;
+ uint16_t ipcs, tcpcs, ipid, ohead;
+
+ ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0;
+ iovcnt = 0;
+ tlen = 0;
+ ntype = 0;
+ tso = 0;
+ ohead = head;
+ hdr = NULL;
+
+ /* iovb[0/1] may be used for writable copy of headers. */
+ iov = &iovb[2];
+
+ for (desc = 0; ; desc++, head = (head + 1) % dsize) {
+ if (head == tail) {
+ *rhead = head;
+ return (0);
+ }
+ dsc = &sc->esc_txdesc[head];
+ dtype = e82545_txdesc_type(dsc->td.lower.data);
+
+ if (desc == 0) {
+ switch (dtype) {
+ case E1000_TXD_TYP_C:
+ DPRINTF("tx ctxt desc idx %d: %016jx "
+ "%08x%08x\r\n",
+ head, dsc->td.buffer_addr,
+ dsc->td.upper.data, dsc->td.lower.data);
+ /* Save context and return */
+ sc->esc_txctx = dsc->cd;
+ goto done;
+ case E1000_TXD_TYP_L:
+ DPRINTF("tx legacy desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ /*
+ * legacy cksum start valid in first descriptor
+ */
+ ntype = dtype;
+ ckinfo[0].ck_start = dsc->td.upper.fields.css;
+ break;
+ case E1000_TXD_TYP_D:
+ DPRINTF("tx data desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ ntype = dtype;
+ break;
+ default:
+ break;
+ }
+ } else {
+ /* Descriptor type must be consistent */
+ assert(dtype == ntype);
+ DPRINTF("tx next desc idx %d: %08x%08x\r\n",
+ head, dsc->td.upper.data, dsc->td.lower.data);
+ }
+
+ len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length :
+ dsc->dd.lower.data & 0xFFFFF;
+
+ if (len > 0) {
+ /* Strip checksum supplied by guest. */
+ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 &&
+ (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0)
+ len -= 2;
+ tlen += len;
+ if (iovcnt < I82545_MAX_TXSEGS) {
+ iov[iovcnt].iov_base = paddr_guest2host(
+ sc->esc_ctx, dsc->td.buffer_addr, len);
+ iov[iovcnt].iov_len = len;
+ }
+ iovcnt++;
+ }
+
+ /*
+ * Pull out info that is valid in the final descriptor
+ * and exit descriptor loop.
+ */
+ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) {
+ if (dtype == E1000_TXD_TYP_L) {
+ if (dsc->td.lower.data & E1000_TXD_CMD_IC) {
+ ckinfo[0].ck_valid = 1;
+ ckinfo[0].ck_off =
+ dsc->td.lower.flags.cso;
+ ckinfo[0].ck_len = 0;
+ }
+ } else {
+ cd = &sc->esc_txctx;
+ if (dsc->dd.lower.data & E1000_TXD_CMD_TSE)
+ tso = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_IXSM)
+ ckinfo[0].ck_valid = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_IXSM || tso) {
+ ckinfo[0].ck_start =
+ cd->lower_setup.ip_fields.ipcss;
+ ckinfo[0].ck_off =
+ cd->lower_setup.ip_fields.ipcso;
+ ckinfo[0].ck_len =
+ cd->lower_setup.ip_fields.ipcse;
+ }
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_TXSM)
+ ckinfo[1].ck_valid = 1;
+ if (dsc->dd.upper.fields.popts &
+ E1000_TXD_POPTS_TXSM || tso) {
+ ckinfo[1].ck_start =
+ cd->upper_setup.tcp_fields.tucss;
+ ckinfo[1].ck_off =
+ cd->upper_setup.tcp_fields.tucso;
+ ckinfo[1].ck_len =
+ cd->upper_setup.tcp_fields.tucse;
+ }
+ }
+ break;
+ }
+ }
+
+ if (iovcnt > I82545_MAX_TXSEGS) {
+ WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n",
+ iovcnt, I82545_MAX_TXSEGS);
+ goto done;
+ }
+
+ hdrlen = vlen = 0;
+ /* Estimate writable space for VLAN header insertion. */
+ if ((sc->esc_CTRL & E1000_CTRL_VME) &&
+ (dsc->td.lower.data & E1000_TXD_CMD_VLE)) {
+ hdrlen = ETHER_ADDR_LEN*2;
+ vlen = ETHER_VLAN_ENCAP_LEN;
+ }
+ if (!tso) {
+ /* Estimate required writable space for checksums. */
+ if (ckinfo[0].ck_valid)
+ hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2);
+ if (ckinfo[1].ck_valid)
+ hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2);
+ /* Round up writable space to the first vector. */
+ if (hdrlen != 0 && iov[0].iov_len > hdrlen &&
+ iov[0].iov_len < hdrlen + 100)
+ hdrlen = iov[0].iov_len;
+ } else {
+ /* In case of TSO header length provided by software. */
+ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len;
+ }
+
+ /* Allocate, fill and prepend writable header vector. */
+ if (hdrlen != 0) {
+ hdr = __builtin_alloca(hdrlen + vlen);
+ hdr += vlen;
+ for (left = hdrlen, hdrp = hdr; left > 0;
+ left -= now, hdrp += now) {
+ now = MIN(left, iov->iov_len);
+ memcpy(hdrp, iov->iov_base, now);
+ iov->iov_base += now;
+ iov->iov_len -= now;
+ if (iov->iov_len == 0) {
+ iov++;
+ iovcnt--;
+ }
+ }
+ iov--;
+ iovcnt++;
+ iov->iov_base = hdr;
+ iov->iov_len = hdrlen;
+ }
+
+ /* Insert VLAN tag. */
+ if (vlen != 0) {
+ hdr -= ETHER_VLAN_ENCAP_LEN;
+ memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2);
+ hdrlen += ETHER_VLAN_ENCAP_LEN;
+ hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8;
+ hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff;
+ hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8;
+ hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff;
+ iov->iov_base = hdr;
+ iov->iov_len += ETHER_VLAN_ENCAP_LEN;
+ /* Correct checksum offsets after VLAN tag insertion. */
+ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN;
+ if (ckinfo[0].ck_len != 0)
+ ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN;
+ ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN;
+ if (ckinfo[1].ck_len != 0)
+ ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN;
+ }
+
+ /* Simple non-TSO case. */
+ if (!tso) {
+ /* Calculate checksums and transmit. */
+ if (ckinfo[0].ck_valid)
+ e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]);
+ if (ckinfo[1].ck_valid)
+ e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]);
+ e82545_transmit_backend(sc, iov, iovcnt);
+ goto done;
+ }
+
+ /* Doing TSO. */
+ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0;
+ mss = sc->esc_txctx.tcp_seg_setup.fields.mss;
+ paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff);
+ DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n",
+ tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt);
+ ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]);
+ tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]);
+ ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off];
+ tcpcs = 0;
+ if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */
+ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off];
+ pv = 1;
+ pvoff = 0;
+ for (seg = 0, left = paylen; left > 0; seg++, left -= now) {
+ now = MIN(left, mss);
+
+ /* Construct IOVs for the segment. */
+ /* Include whole original header. */
+ tiov[0].iov_base = hdr;
+ tiov[0].iov_len = hdrlen;
+ tiovcnt = 1;
+ /* Include respective part of payload IOV. */
+ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) {
+ nnow = MIN(nleft, iov[pv].iov_len - pvoff);
+ tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff;
+ tiov[tiovcnt++].iov_len = nnow;
+ if (pvoff + nnow == iov[pv].iov_len) {
+ pv++;
+ pvoff = 0;
+ } else
+ pvoff += nnow;
+ }
+ DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n",
+ seg, hdrlen, now, tiovcnt);
+
+ /* Update IP header. */
+ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) {
+ /* IPv4 -- set length and ID */
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] =
+ htons(hdrlen - ckinfo[0].ck_start + now);
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] =
+ htons(ipid + seg);
+ } else {
+ /* IPv6 -- set length */
+ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] =
+ htons(hdrlen - ckinfo[0].ck_start - 40 +
+ now);
+ }
+
+ /* Update pseudo-header checksum. */
+ tcpsum = tcpcs;
+ tcpsum += htons(hdrlen - ckinfo[1].ck_start + now);
+
+ /* Update TCP/UDP headers. */
+ if (tcp) {
+ /* Update sequence number and FIN/PUSH flags. */
+ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] =
+ htonl(tcpseq + paylen - left);
+ if (now < left) {
+ hdr[ckinfo[1].ck_start + 13] &=
+ ~(TH_FIN | TH_PUSH);
+ }
+ } else {
+ /* Update payload length. */
+ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] =
+ hdrlen - ckinfo[1].ck_start + now;
+ }
+
+ /* Calculate checksums and transmit. */
+ if (ckinfo[0].ck_valid) {
+ *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs;
+ e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]);
+ }
+ if (ckinfo[1].ck_valid) {
+ *(uint16_t *)&hdr[ckinfo[1].ck_off] =
+ e82545_carry(tcpsum);
+ e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]);
+ }
+ e82545_transmit_backend(sc, tiov, tiovcnt);
+ }
+
+done:
+ head = (head + 1) % dsize;
+ e82545_transmit_done(sc, ohead, head, dsize, tdwb);
+
+ *rhead = head;
+ return (desc + 1);
+}
+
+/*
+ * Process a bounded batch of transmit descriptors.
+ *
+ * The ring size is taken as TDLEN / 16 and at most a quarter of it is
+ * consumed per call.  esc_mtx is released while e82545_transmit() runs and
+ * re-acquired before TDH/TDHr are published and the interrupt cause is
+ * raised, so the caller is expected to hold esc_mtx on entry.
+ */
+static void
+e82545_tx_run(struct e82545_softc *sc)
+{
+	uint32_t icr_bits;
+	uint16_t cur, next, end, ring_size;
+	int budget, quota, wrote_back, consumed;
+
+	cur = sc->esc_TDH;
+	end = sc->esc_TDT;
+	ring_size = sc->esc_TDLEN / 16;
+	quota = ring_size / 4;
+	DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n",
+	    sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT);
+
+	pthread_mutex_unlock(&sc->esc_mtx);
+	next = cur;
+	wrote_back = 0;
+	budget = quota;
+	while (sc->esc_tx_enabled && budget > 0) {
+		consumed = e82545_transmit(sc, cur, end, ring_size, &next,
+		    &wrote_back);
+		/* Nothing left between head and tail. */
+		if (consumed == 0)
+			break;
+		cur = next;
+		budget -= consumed;
+	}
+	pthread_mutex_lock(&sc->esc_mtx);
+
+	sc->esc_TDH = cur;
+	sc->esc_TDHr = next;
+
+	icr_bits = 0;
+	if (wrote_back)
+		icr_bits |= E1000_ICR_TXDW;
+	/* Queue-empty is only reported if this call made progress. */
+	if (budget != quota && sc->esc_TDH == sc->esc_TDT)
+		icr_bits |= E1000_ICR_TXQE;
+	if (icr_bits)
+		e82545_icr_assert(sc, icr_bits);
+
+	DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n",
+	    sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT);
+}
+
+/*
+ * Body of the transmit worker thread.
+ *
+ * Holds esc_mtx except while blocked in pthread_cond_wait() (and while
+ * e82545_tx_run() drops it internally).  The thread sleeps on esc_tx_cond
+ * until transmit is enabled and TDHr differs from TDT, then processes
+ * descriptors.  It never leaves the loop.
+ */
+static void *
+e82545_tx_thread(void *param)
+{
+	struct e82545_softc *sc = param;
+
+	pthread_mutex_lock(&sc->esc_mtx);
+	for (;;) {
+		/*
+		 * The loop condition is evaluated with esc_mtx held, so no
+		 * second check of the same state is needed inside the body.
+		 */
+		while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) {
+			sc->esc_tx_active = 0;
+			/*
+			 * e82545_tx_disable() waits on the same condition
+			 * variable for esc_tx_active to clear; wake it.
+			 */
+			if (sc->esc_tx_enabled == 0)
+				pthread_cond_signal(&sc->esc_tx_cond);
+			pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx);
+		}
+		sc->esc_tx_active = 1;
+
+		/* Process some tx descriptors.  Lock dropped inside. */
+		e82545_tx_run(sc);
+	}
+#ifndef __FreeBSD__
+	return (NULL);
+#endif
+}
+
+/* Signal esc_tx_cond when the transmit side is not marked active. */
+static void
+e82545_tx_start(struct e82545_softc *sc)
+{
+	if (!sc->esc_tx_active)
+		pthread_cond_signal(&sc->esc_tx_cond);
+}
+
+/* Mark the transmit path as enabled; does not wake the tx thread. */
+static void
+e82545_tx_enable(struct e82545_softc *sc)
+{
+	sc->esc_tx_enabled = 1;
+}
+
+/*
+ * Mark the transmit path as disabled and block until esc_tx_active is
+ * clear.  Waits on esc_tx_cond with esc_mtx, so the caller is expected to
+ * hold that mutex.
+ */
+static void
+e82545_tx_disable(struct e82545_softc *sc)
+{
+	sc->esc_tx_enabled = 0;
+	for (;;) {
+		if (!sc->esc_tx_active)
+			break;
+		pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx);
+	}
+}
+
+/* Mark the receive path as enabled. */
+static void
+e82545_rx_enable(struct e82545_softc *sc)
+{
+	sc->esc_rx_enabled = 1;
+}
+
+/*
+ * Mark the receive path as disabled and block until esc_rx_active is
+ * clear.  Waits on esc_rx_cond with esc_mtx, so the caller is expected to
+ * hold that mutex.
+ */
+static void
+e82545_rx_disable(struct e82545_softc *sc)
+{
+	sc->esc_rx_enabled = 0;
+	for (;;) {
+		if (!sc->esc_rx_active)
+			break;
+		pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx);
+	}
+}
+
+/*
+ * Store a guest write to one half of a receive-address (RAL/RAH) pair.
+ *
+ * 'reg' is the 32-bit word index relative to RAL(0): even values select
+ * the low register (MAC octets 0-3), odd values the high register (octets
+ * 4-5, address-select bits 17:16 and the valid bit).
+ *
+ * The index bound is derived from the esc_uni array itself.  The previous
+ * hard-coded "idx < 15" rejected the last entry even though the register
+ * decoder forwards RAL(0) ... RAH(15), so a guest write to entry 15
+ * aborted the process.
+ */
+static void
+e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval)
+{
+	struct eth_uni *eu;
+	int idx;
+
+	idx = reg >> 1;
+	assert(idx >= 0 &&
+	    (size_t)idx < sizeof(sc->esc_uni) / sizeof(sc->esc_uni[0]));
+
+	eu = &sc->esc_uni[idx];
+
+	if (reg & 0x1) {
+		/* RAH */
+		eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV);
+		eu->eu_addrsel = (wval >> 16) & 0x3;
+		eu->eu_eth.octet[5] = wval >> 8;
+		eu->eu_eth.octet[4] = wval;
+	} else {
+		/* RAL */
+		eu->eu_eth.octet[3] = wval >> 24;
+		eu->eu_eth.octet[2] = wval >> 16;
+		eu->eu_eth.octet[1] = wval >> 8;
+		eu->eu_eth.octet[0] = wval;
+	}
+}
+
+/*
+ * Return the value of one half of a receive-address (RAL/RAH) pair.
+ *
+ * 'reg' is the 32-bit word index relative to RAL(0): even values read the
+ * low register, odd values the high register.
+ *
+ * The index bound is derived from the esc_uni array itself (the previous
+ * "idx < 15" rejected the last entry that the register decoder forwards).
+ * Operands are widened to uint32_t before shifting so that moving a set
+ * bit into bit 31 is not a signed-integer overflow.
+ */
+static uint32_t
+e82545_read_ra(struct e82545_softc *sc, int reg)
+{
+	struct eth_uni *eu;
+	uint32_t retval;
+	int idx;
+
+	idx = reg >> 1;
+	assert(idx >= 0 &&
+	    (size_t)idx < sizeof(sc->esc_uni) / sizeof(sc->esc_uni[0]));
+
+	eu = &sc->esc_uni[idx];
+
+	if (reg & 0x1) {
+		/* RAH */
+		retval = ((uint32_t)eu->eu_valid << 31) |
+		    ((uint32_t)eu->eu_addrsel << 16) |
+		    ((uint32_t)eu->eu_eth.octet[5] << 8) |
+		    (uint32_t)eu->eu_eth.octet[4];
+	} else {
+		/* RAL */
+		retval = ((uint32_t)eu->eu_eth.octet[3] << 24) |
+		    ((uint32_t)eu->eu_eth.octet[2] << 16) |
+		    ((uint32_t)eu->eu_eth.octet[1] << 8) |
+		    (uint32_t)eu->eu_eth.octet[0];
+	}
+
+	return (retval);
+}
+
+/*
+ * Handle a 32-bit guest write of 'value' to the device register at byte
+ * 'offset'.
+ *
+ * Offsets that are not 4-byte aligned are logged and ignored.  Most
+ * registers are stored in the softc after masking off bits; device
+ * control, interrupt, receive/transmit control, EEPROM (EECD) and MDIC
+ * writes are handed to their helpers or handled inline.  Writes to MANC,
+ * STATUS and unrecognized offsets are discarded.
+ */
+static void
+e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value)
+{
+ int ridx;
+
+ if (offset & 0x3) {
+ DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value);
+ return;
+ }
+ DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value);
+
+ switch (offset) {
+ case E1000_CTRL:
+ case E1000_CTRL_DUP:
+ e82545_devctl(sc, value);
+ break;
+ case E1000_FCAL:
+ sc->esc_FCAL = value;
+ break;
+ case E1000_FCAH:
+ sc->esc_FCAH = value & ~0xFFFF0000;
+ break;
+ case E1000_FCT:
+ sc->esc_FCT = value & ~0xFFFF0000;
+ break;
+ case E1000_VET:
+ sc->esc_VET = value & ~0xFFFF0000;
+ break;
+ case E1000_FCTTV:
+ sc->esc_FCTTV = value & ~0xFFFF0000;
+ break;
+ case E1000_LEDCTL:
+ sc->esc_LEDCTL = value & ~0x30303000;
+ break;
+ case E1000_PBA:
+ sc->esc_PBA = value & 0x0000FF80;
+ break;
+ case E1000_ICR:
+ case E1000_ITR:
+ case E1000_ICS:
+ case E1000_IMS:
+ case E1000_IMC:
+ e82545_intr_write(sc, offset, value);
+ break;
+ case E1000_RCTL:
+ e82545_rx_ctl(sc, value);
+ break;
+ case E1000_FCRTL:
+ sc->esc_FCRTL = value & ~0xFFFF0007;
+ break;
+ case E1000_FCRTH:
+ sc->esc_FCRTH = value & ~0xFFFF0007;
+ break;
+ case E1000_RDBAL(0):
+ sc->esc_RDBAL = value & ~0xF;
+ if (sc->esc_rx_enabled) {
+ /* Apparently legal: update cached address */
+ e82545_rx_update_rdba(sc);
+ }
+ break;
+ case E1000_RDBAH(0):
+ /*
+ * NOTE(review): a guest write while rx is enabled trips this
+ * assert (and the one for RDLEN below) -- confirm aborting is
+ * the intended response.
+ */
+ assert(!sc->esc_rx_enabled);
+ sc->esc_RDBAH = value;
+ break;
+ case E1000_RDLEN(0):
+ assert(!sc->esc_rx_enabled);
+ sc->esc_RDLEN = value & ~0xFFF0007F;
+ break;
+ case E1000_RDH(0):
+ /* XXX should only ever be zero ? Range check ? */
+ sc->esc_RDH = value;
+ break;
+ case E1000_RDT(0):
+ /* XXX if this opens up the rx ring, do something ? */
+ sc->esc_RDT = value;
+ break;
+ case E1000_RDTR:
+ /* ignore FPD bit 31 */
+ sc->esc_RDTR = value & ~0xFFFF0000;
+ break;
+ case E1000_RXDCTL(0):
+ sc->esc_RXDCTL = value & ~0xFEC0C0C0;
+ break;
+ case E1000_RADV:
+ sc->esc_RADV = value & ~0xFFFF0000;
+ break;
+ case E1000_RSRPD:
+ sc->esc_RSRPD = value & ~0xFFFFF000;
+ break;
+ case E1000_RXCSUM:
+ sc->esc_RXCSUM = value & ~0xFFFFF800;
+ break;
+ case E1000_TXCW:
+ sc->esc_TXCW = value & ~0x3FFF0000;
+ break;
+ case E1000_TCTL:
+ e82545_tx_ctl(sc, value);
+ break;
+ case E1000_TIPG:
+ sc->esc_TIPG = value;
+ break;
+ case E1000_AIT:
+ sc->esc_AIT = value;
+ break;
+ case E1000_TDBAL(0):
+ sc->esc_TDBAL = value & ~0xF;
+ if (sc->esc_tx_enabled) {
+ /* Apparently legal */
+ e82545_tx_update_tdba(sc);
+ }
+ break;
+ case E1000_TDBAH(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TDBAH = value;
+ break;
+ case E1000_TDLEN(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TDLEN = value & ~0xFFF0007F;
+ break;
+ case E1000_TDH(0):
+ //assert(!sc->esc_tx_enabled);
+ /* XXX should only ever be zero ? Range check ? */
+ /* Both the published head and the tx thread's running head are reset. */
+ sc->esc_TDHr = sc->esc_TDH = value;
+ break;
+ case E1000_TDT(0):
+ /* XXX range check ? */
+ sc->esc_TDT = value;
+ /* A new tail may mean new work: signal the tx thread if enabled. */
+ if (sc->esc_tx_enabled)
+ e82545_tx_start(sc);
+ break;
+ case E1000_TIDV:
+ sc->esc_TIDV = value & ~0xFFFF0000;
+ break;
+ case E1000_TXDCTL(0):
+ //assert(!sc->esc_tx_enabled);
+ sc->esc_TXDCTL = value & ~0xC0C0C0;
+ break;
+ case E1000_TADV:
+ sc->esc_TADV = value & ~0xFFFF0000;
+ break;
+ case E1000_RAL(0) ... E1000_RAH(15):
+ /* convert to u32 offset */
+ ridx = (offset - E1000_RAL(0)) >> 2;
+ e82545_write_ra(sc, ridx, value);
+ break;
+ case E1000_MTA ... (E1000_MTA + (127*4)):
+ sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value;
+ break;
+ case E1000_VFTA ... (E1000_VFTA + (127*4)):
+ sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value;
+ break;
+ case E1000_EECD:
+ {
+ //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value);
+ /* edge triggered low->high */
+ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ?
+ 0 : (value & E1000_EECD_SK));
+ uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS|
+ E1000_EECD_DI|E1000_EECD_REQ);
+ /* Only the bits in eecd_mask are taken from the guest's value. */
+ sc->eeprom_control &= ~eecd_mask;
+ sc->eeprom_control |= (value & eecd_mask);
+ /* grant/revoke immediately */
+ if (value & E1000_EECD_REQ) {
+ sc->eeprom_control |= E1000_EECD_GNT;
+ } else {
+ sc->eeprom_control &= ~E1000_EECD_GNT;
+ }
+ if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) {
+ e82545_eecd_strobe(sc);
+ }
+ return;
+ }
+ case E1000_MDIC:
+ {
+ uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >>
+ E1000_MDIC_REG_SHIFT);
+ uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >>
+ E1000_MDIC_PHY_SHIFT);
+ sc->mdi_control =
+ (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST));
+ if ((value & E1000_MDIC_READY) != 0) {
+ DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value);
+ return;
+ }
+ switch (value & E82545_MDIC_OP_MASK) {
+ case E1000_MDIC_OP_READ:
+ sc->mdi_control &= ~E82545_MDIC_DATA_MASK;
+ sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr);
+ break;
+ case E1000_MDIC_OP_WRITE:
+ e82545_write_mdi(sc, reg_addr, phy_addr,
+ value & E82545_MDIC_DATA_MASK);
+ break;
+ default:
+ DPRINTF("Unknown MDIC op: 0x%x\r\n", value);
+ return;
+ }
+ /* TODO: barrier? */
+ /* The operation completes synchronously, so READY is set at once. */
+ sc->mdi_control |= E1000_MDIC_READY;
+ if (value & E82545_MDIC_IE) {
+ // TODO: generate interrupt
+ }
+ return;
+ }
+ case E1000_MANC:
+ case E1000_STATUS:
+ return;
+ default:
+ DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value);
+ return;
+ }
+}
+
+/*
+ * Return the value of the device register at byte 'offset' for a 32-bit
+ * guest read.
+ *
+ * Offsets that are not 4-byte aligned, and offsets that match no case, are
+ * logged and read as 0.  Interrupt registers are delegated to
+ * e82545_intr_read(); STATUS is a fixed full-duplex/link-up/1000 value;
+ * the remaining cases return softc state, emulated statistics counters,
+ * or constant 0.
+ */
+static uint32_t
+e82545_read_register(struct e82545_softc *sc, uint32_t offset)
+{
+ uint32_t retval;
+ int ridx;
+
+ if (offset & 0x3) {
+ DPRINTF("Unaligned register read offset:0x%x\r\n", offset);
+ return 0;
+ }
+
+ DPRINTF("Register read: 0x%x\r\n", offset);
+
+ switch (offset) {
+ case E1000_CTRL:
+ retval = sc->esc_CTRL;
+ break;
+ case E1000_STATUS:
+ retval = E1000_STATUS_FD | E1000_STATUS_LU |
+ E1000_STATUS_SPEED_1000;
+ break;
+ case E1000_FCAL:
+ retval = sc->esc_FCAL;
+ break;
+ case E1000_FCAH:
+ retval = sc->esc_FCAH;
+ break;
+ case E1000_FCT:
+ retval = sc->esc_FCT;
+ break;
+ case E1000_VET:
+ retval = sc->esc_VET;
+ break;
+ case E1000_FCTTV:
+ retval = sc->esc_FCTTV;
+ break;
+ case E1000_LEDCTL:
+ retval = sc->esc_LEDCTL;
+ break;
+ case E1000_PBA:
+ retval = sc->esc_PBA;
+ break;
+ case E1000_ICR:
+ case E1000_ITR:
+ case E1000_ICS:
+ case E1000_IMS:
+ case E1000_IMC:
+ retval = e82545_intr_read(sc, offset);
+ break;
+ case E1000_RCTL:
+ retval = sc->esc_RCTL;
+ break;
+ case E1000_FCRTL:
+ retval = sc->esc_FCRTL;
+ break;
+ case E1000_FCRTH:
+ retval = sc->esc_FCRTH;
+ break;
+ case E1000_RDBAL(0):
+ retval = sc->esc_RDBAL;
+ break;
+ case E1000_RDBAH(0):
+ retval = sc->esc_RDBAH;
+ break;
+ case E1000_RDLEN(0):
+ retval = sc->esc_RDLEN;
+ break;
+ case E1000_RDH(0):
+ retval = sc->esc_RDH;
+ break;
+ case E1000_RDT(0):
+ retval = sc->esc_RDT;
+ break;
+ case E1000_RDTR:
+ retval = sc->esc_RDTR;
+ break;
+ case E1000_RXDCTL(0):
+ retval = sc->esc_RXDCTL;
+ break;
+ case E1000_RADV:
+ retval = sc->esc_RADV;
+ break;
+ case E1000_RSRPD:
+ retval = sc->esc_RSRPD;
+ break;
+ case E1000_RXCSUM:
+ retval = sc->esc_RXCSUM;
+ break;
+ case E1000_TXCW:
+ retval = sc->esc_TXCW;
+ break;
+ case E1000_TCTL:
+ retval = sc->esc_TCTL;
+ break;
+ case E1000_TIPG:
+ retval = sc->esc_TIPG;
+ break;
+ case E1000_AIT:
+ retval = sc->esc_AIT;
+ break;
+ case E1000_TDBAL(0):
+ retval = sc->esc_TDBAL;
+ break;
+ case E1000_TDBAH(0):
+ retval = sc->esc_TDBAH;
+ break;
+ case E1000_TDLEN(0):
+ retval = sc->esc_TDLEN;
+ break;
+ case E1000_TDH(0):
+ retval = sc->esc_TDH;
+ break;
+ case E1000_TDT(0):
+ retval = sc->esc_TDT;
+ break;
+ case E1000_TIDV:
+ retval = sc->esc_TIDV;
+ break;
+ case E1000_TXDCTL(0):
+ retval = sc->esc_TXDCTL;
+ break;
+ case E1000_TADV:
+ retval = sc->esc_TADV;
+ break;
+ case E1000_RAL(0) ... E1000_RAH(15):
+ /* convert to u32 offset */
+ ridx = (offset - E1000_RAL(0)) >> 2;
+ retval = e82545_read_ra(sc, ridx);
+ break;
+ case E1000_MTA ... (E1000_MTA + (127*4)):
+ retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2];
+ break;
+ case E1000_VFTA ... (E1000_VFTA + (127*4)):
+ retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2];
+ break;
+ case E1000_EECD:
+ //DPRINTF("EECD read %x\r\n", sc->eeprom_control);
+ retval = sc->eeprom_control;
+ break;
+ case E1000_MDIC:
+ retval = sc->mdi_control;
+ break;
+ case E1000_MANC:
+ retval = 0;
+ break;
+ /* stats that we emulate. */
+ case E1000_MPC:
+ retval = sc->missed_pkt_count;
+ break;
+ case E1000_PRC64:
+ retval = sc->pkt_rx_by_size[0];
+ break;
+ case E1000_PRC127:
+ retval = sc->pkt_rx_by_size[1];
+ break;
+ case E1000_PRC255:
+ retval = sc->pkt_rx_by_size[2];
+ break;
+ case E1000_PRC511:
+ retval = sc->pkt_rx_by_size[3];
+ break;
+ case E1000_PRC1023:
+ retval = sc->pkt_rx_by_size[4];
+ break;
+ case E1000_PRC1522:
+ retval = sc->pkt_rx_by_size[5];
+ break;
+ case E1000_GPRC:
+ retval = sc->good_pkt_rx_count;
+ break;
+ case E1000_BPRC:
+ retval = sc->bcast_pkt_rx_count;
+ break;
+ case E1000_MPRC:
+ retval = sc->mcast_pkt_rx_count;
+ break;
+ case E1000_GPTC:
+ case E1000_TPT:
+ retval = sc->good_pkt_tx_count;
+ break;
+ /* Octet counters are returned as separate low and high 32-bit halves. */
+ case E1000_GORCL:
+ retval = (uint32_t)sc->good_octets_rx;
+ break;
+ case E1000_GORCH:
+ retval = (uint32_t)(sc->good_octets_rx >> 32);
+ break;
+ case E1000_TOTL:
+ case E1000_GOTCL:
+ retval = (uint32_t)sc->good_octets_tx;
+ break;
+ case E1000_TOTH:
+ case E1000_GOTCH:
+ retval = (uint32_t)(sc->good_octets_tx >> 32);
+ break;
+ case E1000_ROC:
+ retval = sc->oversize_rx_count;
+ break;
+ case E1000_TORL:
+ retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets);
+ break;
+ case E1000_TORH:
+ retval = (uint32_t)((sc->good_octets_rx +
+ sc->missed_octets) >> 32);
+ break;
+ case E1000_TPR:
+ retval = sc->good_pkt_rx_count + sc->missed_pkt_count +
+ sc->oversize_rx_count;
+ break;
+ case E1000_PTC64:
+ retval = sc->pkt_tx_by_size[0];
+ break;
+ case E1000_PTC127:
+ retval = sc->pkt_tx_by_size[1];
+ break;
+ case E1000_PTC255:
+ retval = sc->pkt_tx_by_size[2];
+ break;
+ case E1000_PTC511:
+ retval = sc->pkt_tx_by_size[3];
+ break;
+ case E1000_PTC1023:
+ retval = sc->pkt_tx_by_size[4];
+ break;
+ case E1000_PTC1522:
+ retval = sc->pkt_tx_by_size[5];
+ break;
+ case E1000_MPTC:
+ retval = sc->mcast_pkt_tx_count;
+ break;
+ case E1000_BPTC:
+ retval = sc->bcast_pkt_tx_count;
+ break;
+ case E1000_TSCTC:
+ retval = sc->tso_tx_count;
+ break;
+ /* stats that are always 0. */
+ case E1000_CRCERRS:
+ case E1000_ALGNERRC:
+ case E1000_SYMERRS:
+ case E1000_RXERRC:
+ case E1000_SCC:
+ case E1000_ECOL:
+ case E1000_MCC:
+ case E1000_LATECOL:
+ case E1000_COLC:
+ case E1000_DC:
+ case E1000_TNCRS:
+ case E1000_SEC:
+ case E1000_CEXTERR:
+ case E1000_RLEC:
+ case E1000_XONRXC:
+ case E1000_XONTXC:
+ case E1000_XOFFRXC:
+ case E1000_XOFFTXC:
+ case E1000_FCRUC:
+ case E1000_RNBC:
+ case E1000_RUC:
+ case E1000_RFC:
+ case E1000_RJC:
+ case E1000_MGTPRC:
+ case E1000_MGTPDC:
+ case E1000_MGTPTC:
+ case E1000_TSCTFC:
+ retval = 0;
+ break;
+ default:
+ DPRINTF("Unknown read register: 0x%x\r\n", offset);
+ retval = 0;
+ break;
+ }
+
+ return (retval);
+}
+
+/*
+ * BAR write callback.
+ *
+ * Register-BAR writes go straight to e82545_write_register().  The I/O
+ * BAR exposes an address/data pair: a write to IOADDR latches the register
+ * offset in sc->io_addr and a write to IODATA is forwarded to that offset.
+ * Only 4-byte accesses are acted on; everything else is logged and
+ * dropped.  The whole operation runs with esc_mtx held.
+ */
+static void
+e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size, uint64_t value)
+{
+	struct e82545_softc *sc = pi->pi_arg;
+
+	pthread_mutex_lock(&sc->esc_mtx);
+
+	if (baridx == E82545_BAR_IO) {
+		if (offset == E82545_IOADDR) {
+			if (size == 4) {
+				sc->io_addr = (uint32_t)value;
+			} else {
+				DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value);
+			}
+		} else if (offset == E82545_IODATA) {
+			if (size != 4) {
+				DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value);
+			} else if (sc->io_addr > E82545_IO_REGISTER_MAX) {
+				DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value);
+			} else {
+				e82545_write_register(sc, sc->io_addr,
+				    (uint32_t)value);
+			}
+		} else {
+			DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size);
+		}
+	} else if (baridx == E82545_BAR_REGISTER) {
+		if (size == 4) {
+			e82545_write_register(sc, (uint32_t)offset,
+			    (uint32_t)value);
+		} else {
+			DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value);
+		}
+	} else {
+		DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n",
+		    baridx, offset, value, size);
+	}
+
+	pthread_mutex_unlock(&sc->esc_mtx);
+}
+
+/*
+ * BAR read callback.
+ *
+ * Register-BAR reads go straight to e82545_read_register().  On the I/O
+ * BAR, IOADDR returns the latched register offset and IODATA returns the
+ * register it selects.  Only 4-byte accesses are acted on; anything else
+ * is logged and reads as 0.  The whole operation runs with esc_mtx held.
+ *
+ * A wrong-sized IODATA access is rejected without touching the register,
+ * matching the write path and the register BAR.  Previously the size
+ * check only logged and the register read was performed anyway.
+ */
+static uint64_t
+e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct e82545_softc *sc;
+	uint64_t retval;
+
+	//DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size);
+	sc = pi->pi_arg;
+	retval = 0;
+
+	pthread_mutex_lock(&sc->esc_mtx);
+
+	switch (baridx) {
+	case E82545_BAR_IO:
+		switch (offset) {
+		case E82545_IOADDR:
+			if (size != 4) {
+				DPRINTF("Wrong io addr read sz:%d\r\n", size);
+			} else
+				retval = sc->io_addr;
+			break;
+		case E82545_IODATA:
+			if (size != 4) {
+				DPRINTF("Wrong io data read sz:%d\r\n", size);
+			} else if (sc->io_addr > E82545_IO_REGISTER_MAX) {
+				DPRINTF("Non-register io read addr:0x%x\r\n",
+				    sc->io_addr);
+			} else
+				retval = e82545_read_register(sc, sc->io_addr);
+			break;
+		default:
+			DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n",
+			    offset, size);
+			break;
+		}
+		break;
+	case E82545_BAR_REGISTER:
+		if (size != 4) {
+			DPRINTF("Wrong register read size:%d offset:0x%lx\r\n",
+			    size, offset);
+		} else
+			retval = e82545_read_register(sc, (uint32_t)offset);
+		break;
+	default:
+		DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n",
+		    baridx, offset, size);
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->esc_mtx);
+
+	return (retval);
+}
+
+/*
+ * Return the device model to its reset state.
+ *
+ * 'drvr' is zero for the reset performed at device creation, which clears
+ * everything, and non-zero otherwise, in which case a subset of registers
+ * (flow control, VET, descriptor base addresses, some tx timers and the
+ * programmed filter tables) is left alone.  Both data paths are disabled
+ * first, which may block on their condition variables.
+ */
+static void
+e82545_reset(struct e82545_softc *sc, int drvr)
+{
+	const int full_reset = (drvr == 0);
+	int slot;
+
+	e82545_rx_disable(sc);
+	e82545_tx_disable(sc);
+
+	/* clear outstanding interrupts */
+	if (sc->esc_irq_asserted)
+		pci_lintr_deassert(sc->esc_pi);
+
+	/* misc */
+	if (full_reset) {
+		sc->esc_FCAL = 0;
+		sc->esc_FCAH = 0;
+		sc->esc_FCT = 0;
+		sc->esc_VET = 0;
+		sc->esc_FCTTV = 0;
+	}
+	sc->esc_LEDCTL = 0x07061302;
+	sc->esc_PBA = 0x00100030;
+
+	/* start nvm in opcode mode. */
+	sc->nvm_opaddr = 0;
+	sc->nvm_mode = E82545_NVM_MODE_OPADDR;
+	sc->nvm_bits = E82545_NVM_OPADDR_BITS;
+	sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN;
+	e82545_init_eeprom(sc);
+
+	/* interrupt */
+	sc->esc_ICR = 0;
+	sc->esc_ITR = 250;
+	sc->esc_ICS = 0;
+	sc->esc_IMS = 0;
+	sc->esc_IMC = 0;
+
+	/* L2 filters */
+	if (drvr) {
+		/* Clear RAH valid bits */
+		slot = 0;
+		while (slot < 16) {
+			sc->esc_uni[slot].eu_valid = 0;
+			slot++;
+		}
+	} else {
+		memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan));
+		memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast));
+		memset(sc->esc_uni, 0, sizeof(sc->esc_uni));
+
+		/* XXX not necessary on 82545 ?? */
+		sc->esc_uni[0].eu_valid = 1;
+		memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet,
+		    ETHER_ADDR_LEN);
+	}
+
+	/* receive */
+	if (full_reset) {
+		sc->esc_RDBAL = 0;
+		sc->esc_RDBAH = 0;
+	}
+	sc->esc_RCTL = 0;
+	sc->esc_FCRTL = 0;
+	sc->esc_FCRTH = 0;
+	sc->esc_RDLEN = 0;
+	sc->esc_RDH = 0;
+	sc->esc_RDT = 0;
+	sc->esc_RDTR = 0;
+	sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */
+	sc->esc_RADV = 0;
+	sc->esc_RXCSUM = 0;
+
+	/* transmit */
+	if (full_reset) {
+		sc->esc_TDBAL = 0;
+		sc->esc_TDBAH = 0;
+		sc->esc_TIPG = 0;
+		sc->esc_AIT = 0;
+		sc->esc_TIDV = 0;
+		sc->esc_TADV = 0;
+	}
+	sc->esc_tdba = 0;
+	sc->esc_txdesc = NULL;
+	sc->esc_TXCW = 0;
+	sc->esc_TCTL = 0;
+	sc->esc_TDLEN = 0;
+	sc->esc_TDT = 0;
+	sc->esc_TDH = 0;
+	sc->esc_TDHr = 0;
+	sc->esc_TXDCTL = 0;
+}
+
+/*
+ * Open "/dev/<opts>" as the network backend and store the descriptor in
+ * sc->esc_tapfd (-1 when 'opts' is NULL or setup fails).
+ *
+ * Failure to open the device terminates the process.  Failure to switch
+ * the descriptor to non-blocking mode closes it and returns with
+ * esc_tapfd == -1.  Previously execution fell through and applied
+ * Capsicum rights and event registration to descriptor -1.
+ */
+static void
+e82545_open_tap(struct e82545_softc *sc, char *opts)
+{
+	char tbuf[80];
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+	int opt;
+
+	if (opts == NULL) {
+		sc->esc_tapfd = -1;
+		return;
+	}
+
+	strcpy(tbuf, "/dev/");
+	strlcat(tbuf, opts, sizeof(tbuf));
+
+	sc->esc_tapfd = open(tbuf, O_RDWR);
+	if (sc->esc_tapfd == -1) {
+		DPRINTF("unable to open tap device %s\n", opts);
+		exit(4);
+	}
+
+	/*
+	 * Set non-blocking and register for read
+	 * notifications with the event loop
+	 */
+	opt = 1;
+	if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) {
+		WPRINTF("tap device O_NONBLOCK failed: %d\n", errno);
+		close(sc->esc_tapfd);
+		sc->esc_tapfd = -1;
+		/* Nothing below is meaningful without a descriptor. */
+		return;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(sc->esc_tapfd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+#ifdef __FreeBSD__
+	sc->esc_mevp = mevent_add(sc->esc_tapfd,
+	    EVF_READ,
+	    e82545_tap_callback,
+	    sc);
+	if (sc->esc_mevp == NULL) {
+		DPRINTF("Could not register mevent %d\n", EVF_READ);
+		close(sc->esc_tapfd);
+		sc->esc_tapfd = -1;
+	}
+#endif
+}
+
+/*
+ * Parse a "mac=<address>" option into mac_addr.
+ *
+ * Returns 1 (after printing a message to stderr) when the key is "mac"
+ * but the address does not parse, is multicast, or is all zeroes.
+ * Returns 0 otherwise, including when the string is not a "mac=" option
+ * at all, in which case mac_addr is left untouched.  The input string is
+ * modified by strsep().
+ */
+static int
+e82545_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+	char all_zero[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+	struct ether_addr *parsed;
+	char *key;
+
+	key = strsep(&mac_str, "=");
+
+	/* No '=' present, or some other option: nothing to do. */
+	if (mac_str == NULL || strcmp(key, "mac") != 0)
+		return (0);
+
+	parsed = ether_aton(mac_str);
+	if (parsed == NULL || ETHER_IS_MULTICAST(parsed->octet) ||
+	    memcmp(parsed->octet, all_zero, ETHER_ADDR_LEN) == 0) {
+		fprintf(stderr, "Invalid MAC %s\n", mac_str);
+		return (1);
+	}
+
+	memcpy(mac_addr, parsed->octet, ETHER_ADDR_LEN);
+	return (0);
+}
+
+/*
+ * Device-model init callback: allocate the softc, start the transmit
+ * thread, fill in PCI config space and BARs, parse the option string
+ * ("<backend>[,mac=<addr>]"), pick a MAC address and reset the device.
+ *
+ * Returns 0 on success and non-zero if the softc cannot be allocated or
+ * the MAC option is invalid.
+ *
+ * NOTE(review): the invalid-MAC path returns after the transmit thread has
+ * been created and leaves the softc allocated -- presumably the caller
+ * treats an init failure as fatal; confirm.
+ */
+static int
+e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	MD5_CTX mdctx;
+	unsigned char digest[16];
+	char nstr[80];
+	struct e82545_softc *sc;
+	char *devname;
+	char *vtopts;
+	int mac_provided;
+
+	DPRINTF("Loading with options: %s\r\n", opts);
+
+	/* Setup our softc */
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL) {
+		WPRINTF("unable to allocate softc for %d:%d\n",
+		    pi->pi_slot, pi->pi_func);
+		return (1);
+	}
+
+	pi->pi_arg = sc;
+	sc->esc_pi = pi;
+	sc->esc_ctx = ctx;
+
+	pthread_mutex_init(&sc->esc_mtx, NULL);
+	pthread_cond_init(&sc->esc_rx_cond, NULL);
+	pthread_cond_init(&sc->esc_tx_cond, NULL);
+	pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc);
+	snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot,
+	    pi->pi_func);
+	pthread_set_name_np(sc->esc_tx_tid, nstr);
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL);
+
+	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+	pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1);
+
+	/* TODO: this card also supports msi, but the freebsd driver for it
+	 * does not, so I have not implemented it. */
+	pci_lintr_request(pi);
+
+	pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32,
+	    E82545_BAR_REGISTER_LEN);
+	pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32,
+	    E82545_BAR_FLASH_LEN);
+	pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO,
+	    E82545_BAR_IO_LEN);
+
+	/*
+	 * Attempt to open the tap device and read the MAC address
+	 * if specified.  Copied from virtio-net, slightly modified.
+	 */
+	mac_provided = 0;
+	sc->esc_tapfd = -1;
+	if (opts != NULL) {
+		int err;
+
+		/* Work on a copy: strsep() splits the string in place. */
+		devname = vtopts = strdup(opts);
+		(void) strsep(&vtopts, ",");
+
+		if (vtopts != NULL) {
+			err = e82545_parsemac(vtopts, sc->esc_mac.octet);
+			if (err != 0) {
+				free(devname);
+				return (err);
+			}
+			mac_provided = 1;
+		}
+
+		if (strncmp(devname, "tap", 3) == 0 ||
+		    strncmp(devname, "vmnet", 5) == 0)
+			e82545_open_tap(sc, devname);
+
+		free(devname);
+	}
+
+	/*
+	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the PCI slot/func number and dev name
+	 */
+	if (!mac_provided) {
+		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+		    pi->pi_func, vmname);
+
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, nstr, strlen(nstr));
+		MD5Final(digest, &mdctx);
+
+		sc->esc_mac.octet[0] = 0x00;
+		sc->esc_mac.octet[1] = 0xa0;
+		sc->esc_mac.octet[2] = 0x98;
+		sc->esc_mac.octet[3] = digest[0];
+		sc->esc_mac.octet[4] = digest[1];
+		sc->esc_mac.octet[5] = digest[2];
+	}
+
+	/* H/w initiated reset */
+	e82545_reset(sc, 0);
+
+	return (0);
+}
+
+/* Device-model descriptor registered under the name "e1000". */
+struct pci_devemu pci_de_e82545 = {
+	.pe_emu		= "e1000",
+	.pe_init	= e82545_init,
+	.pe_barread	= e82545_read,
+	.pe_barwrite	= e82545_write,
+};
+PCI_EMUL_SET(pci_de_e82545);
+
diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c
new file mode 100644
index 0000000000..03db632e37
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_emul.c
@@ -0,0 +1,2141 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "bhyverun.h"
+#include "inout.h"
+#include "ioapic.h"
+#include "mem.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+#define CONF1_ADDR_PORT 0x0cf8 /* presumably the config-space address port; 0xCF8 is also the fixed I/O port reported for bus 0 in the DSDT */
+#define CONF1_DATA_PORT 0x0cfc /* presumably the matching data port; its users are outside this part of the file -- confirm */
+
+#define CONF1_ENABLE 0x80000000ul /* bit 31; NOTE(review): looks like the enable flag of the address register -- confirm */
+
+#define MAXBUSES (PCI_BUSMAX + 1) /* array dimension: highest bus number plus one */
+#define MAXSLOTS (PCI_SLOTMAX + 1) /* array dimension: highest slot number plus one */
+#define MAXFUNCS (PCI_FUNCMAX + 1) /* array dimension: highest function number plus one */
+
+struct funcinfo { /* one PCI function as configured from a slot option */
+ char *fi_name; /* emulation name; set from the <emul> field by pci_parse_slot() */
+ char *fi_param; /* optional <config> string; NULL when the option had none */
+ struct pci_devinst *fi_devi; /* instance stored by pci_emul_init() once pe_init succeeds */
+};
+
+struct intxinfo { /* state kept for one entry of slotinfo.si_intpins */
+ int ii_count; /* NOTE(review): not referenced in this part of the file; presumably counts users of the pin -- confirm */
+ int ii_pirq_pin; /* presumably the PIRQ pin this INTx pin is routed to -- confirm */
+ int ii_ioapic_irq; /* presumably the I/O APIC IRQ this INTx pin is routed to -- confirm */
+};
+
+struct slotinfo { /* per-slot state within a bus */
+ struct intxinfo si_intpins[4]; /* four entries, presumably one per INTx pin (INTA#..INTD#) -- confirm */
+ struct funcinfo si_funcs[MAXFUNCS]; /* indexed by function number */
+};
+
+struct businfo { /* per-bus state; allocated on first use by pci_parse_slot() */
+ uint16_t iobase, iolimit; /* I/O window */
+ uint32_t membase32, memlimit32; /* mmio window below 4GB */
+ uint64_t membase64, memlimit64; /* mmio window above 4GB */
+ struct slotinfo slotinfo[MAXSLOTS]; /* indexed by slot number */
+};
+
+static struct businfo *pci_businfo[MAXBUSES]; /* indexed by bus number; an entry stays NULL until a slot option names that bus */
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu); /* set of device emulations walked with SET_FOREACH */
+
+static uint64_t pci_emul_iobase; /* next candidate address for I/O BAR allocation; reset in init_pci() */
+static uint64_t pci_emul_membase32; /* next candidate address for BARs placed below 4GB; reset in init_pci() */
+static uint64_t pci_emul_membase64; /* next candidate address for BARs placed in the 64-bit window; reset in init_pci() */
+
+#define PCI_EMUL_IOBASE 0x2000 /* initial value of pci_emul_iobase */
+#define PCI_EMUL_IOLIMIT 0x10000 /* exclusive upper bound for I/O BAR allocation */
+
+#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); /* NOTE(review): macro defined elsewhere; presumably reserves the window as a system resource -- confirm */
+
+#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE /* 32-bit BAR allocation stops where the extended config window begins */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL /* initial value of pci_emul_membase64 */
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL /* exclusive upper bound for 64-bit BAR allocation */
+
+static struct pci_devemu *pci_emul_finddev(char *name); /* defined later in this file */
+static void pci_lintr_route(struct pci_devinst *pi); /* defined outside this part of the file */
+static void pci_lintr_update(struct pci_devinst *pi); /* defined outside this part of the file */
+static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
+ int func, int coff, int bytes, uint32_t *val); /* defined outside this part of the file */
+
+/*
+ * Store 'val' at config-space offset 'coff' of device 'pi' using an access
+ * width of 'bytes'.  Widths 1 and 2 select the 8- and 16-bit setters; any
+ * other width is treated as a 32-bit store.
+ */
+static __inline void
+CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes)
+{
+
+	switch (bytes) {
+	case 1:
+		pci_set_cfgdata8(pi, coff, val);
+		break;
+	case 2:
+		pci_set_cfgdata16(pi, coff, val);
+		break;
+	default:
+		pci_set_cfgdata32(pi, coff, val);
+		break;
+	}
+}
+
+/*
+ * Fetch the value at config-space offset 'coff' of device 'pi' using an
+ * access width of 'bytes'.  Widths 1 and 2 select the 8- and 16-bit
+ * getters; any other width is treated as a 32-bit load.
+ */
+static __inline uint32_t
+CFGREAD(struct pci_devinst *pi, int coff, int bytes)
+{
+
+	switch (bytes) {
+	case 1:
+		return (pci_get_cfgdata8(pi, coff));
+	case 2:
+		return (pci_get_cfgdata16(pi, coff));
+	default:
+		return (pci_get_cfgdata32(pi, coff));
+	}
+}
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <bus>:<slot>:<func>,<emul>[,<config>]
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+/* Report a malformed slot option string 'aopt' on stderr. */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+
+	(void) fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt);
+}
+
+/*
+ * Parse one slot option of the form described above and record the
+ * emulation name and optional config string for the addressed function.
+ *
+ * Returns 0 on success and -1 on any failure (malformed option, numbers out
+ * of range, function already configured, unknown emulation, or allocation
+ * failure); a diagnostic is written to stderr in each failure case.
+ *
+ * On success the private copy of 'opt' is deliberately kept alive because
+ * fi_name and fi_param point into it; on failure it is released.
+ */
+int
+pci_parse_slot(char *opt)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	char *emul, *config, *str, *cp;
+	int error, bnum, snum, fnum;
+
+	error = -1;
+	str = strdup(opt);
+	if (str == NULL) {
+		fprintf(stderr, "pci slot \"%s\": out of memory\n", opt);
+		goto done;
+	}
+
+	/* Split "<address>,<emul>[,<config>]" in place. */
+	emul = config = NULL;
+	if ((cp = strchr(str, ',')) != NULL) {
+		*cp = '\0';
+		emul = cp + 1;
+		if ((cp = strchr(emul, ',')) != NULL) {
+			*cp = '\0';
+			config = cp + 1;
+		}
+	} else {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	/*
+	 * Try the longest address form first.  A failed attempt may already
+	 * have stored some fields, so each fallback resets what it needs.
+	 */
+	/* <bus>:<slot>:<func> */
+	if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
+		bnum = 0;
+		/* <slot>:<func> */
+		if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
+			fnum = 0;
+			/* <slot> */
+			if (sscanf(str, "%d", &snum) != 1) {
+				snum = -1;
+			}
+		}
+	}
+
+	if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
+	    fnum < 0 || fnum >= MAXFUNCS) {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	/* Bus state is created lazily, the first time a bus is named. */
+	if (pci_businfo[bnum] == NULL) {
+		pci_businfo[bnum] = calloc(1, sizeof(struct businfo));
+		if (pci_businfo[bnum] == NULL) {
+			fprintf(stderr, "pci slot %d:%d: out of memory\n",
+			    snum, fnum);
+			goto done;
+		}
+	}
+
+	bi = pci_businfo[bnum];
+	si = &bi->slotinfo[snum];
+
+	if (si->si_funcs[fnum].fi_name != NULL) {
+		fprintf(stderr, "pci slot %d:%d already occupied!\n",
+		    snum, fnum);
+		goto done;
+	}
+
+	if (pci_emul_finddev(emul) == NULL) {
+		fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n",
+		    snum, fnum, emul);
+		goto done;
+	}
+
+	error = 0;
+	si->si_funcs[fnum].fi_name = emul;
+	si->si_funcs[fnum].fi_param = config;
+
+done:
+	/* free(NULL) is a no-op, so the strdup() failure path is safe. */
+	if (error)
+		free(str);
+
+	return (error);
+}
+
+/*
+ * Print the name (pe_emu) of every device emulation in pci_devemu_set to
+ * stdout, one per line.
+ */
+void
+pci_print_supported_devices(void)
+{
+	struct pci_devemu **pdpp, *pdp;
+
+	SET_FOREACH(pdpp, pci_devemu_set) {
+		pdp = *pdpp;
+		printf("%s\n", pdp->pe_emu);
+	}
+}
+
+/*
+ * Return 1 when 'offset' lies inside the device's PBA region
+ * [pba_offset, pba_offset + pba_size), otherwise 0.
+ */
+static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+
+	return (offset >= pi->pi_msix.pba_offset &&
+	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size);
+}
+
+/*
+ * Write 'value' ('size' bytes) into the MSI-X table of 'pi' at byte
+ * 'offset' from the start of the table.
+ *
+ * Only naturally aligned 4- or 8-byte writes that fall inside the table are
+ * accepted.  Returns 0 on success and -1 when the write is rejected.
+ */
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+    uint64_t value)
+{
+	uint64_t tab_index;
+	int msix_entry_offset;
+	char *dest;
+
+	/* support only 4 or 8 byte writes */
+	if (size != 4 && size != 8)
+		return (-1);
+
+	/*
+	 * Return if table index is beyond what device supports.  The index is
+	 * kept in 64 bits so that a very large offset cannot be narrowed
+	 * into a small or negative index that passes the check.
+	 */
+	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+	if (tab_index >= (uint64_t)pi->pi_msix.table_count)
+		return (-1);
+
+	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* support only aligned writes */
+	if ((msix_entry_offset % size) != 0)
+		return (-1);
+
+	dest = (char *)(pi->pi_msix.table + tab_index);
+	dest += msix_entry_offset;
+
+	if (size == 4)
+		*((uint32_t *)dest) = value;
+	else
+		*((uint64_t *)dest) = value;
+
+	return (0);
+}
+
+/*
+ * Read 'size' bytes from the MSI-X table / PBA area of 'pi' at byte
+ * 'offset'.
+ *
+ * Returns the table contents for an access inside the table, 0 for an
+ * access inside the PBA, and all ones for anything else (unsupported size,
+ * misaligned access, or an offset in neither region).
+ */
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+	uint64_t tab_index;
+	uint64_t retval = ~0;
+	int msix_entry_offset;
+	char *dest;
+
+	/*
+	 * The PCI standard only allows 4 and 8 byte accesses to the MSI-X
+	 * table but we also allow 1 byte access to accommodate reads from
+	 * ddb.
+	 */
+	if (size != 1 && size != 4 && size != 8)
+		return (retval);
+
+	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* support only aligned reads */
+	if ((msix_entry_offset % size) != 0) {
+		return (retval);
+	}
+
+	/*
+	 * Keep the index in 64 bits so that a very large offset cannot be
+	 * narrowed into a small or negative index that passes the check.
+	 */
+	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+
+	if (tab_index < (uint64_t)pi->pi_msix.table_count) {
+		/* valid MSI-X Table access */
+		dest = (char *)(pi->pi_msix.table + tab_index);
+		dest += msix_entry_offset;
+
+		if (size == 1)
+			retval = *((uint8_t *)dest);
+		else if (size == 4)
+			retval = *((uint32_t *)dest);
+		else
+			retval = *((uint64_t *)dest);
+	} else if (pci_valid_pba_offset(pi, offset)) {
+		/* return 0 for PBA access */
+		retval = 0;
+	}
+
+	return (retval);
+}
+
+/*
+ * Return the BAR index that holds the MSI-X table of 'pi', or -1 when no
+ * MSI-X table has been allocated for the device.
+ */
+int
+pci_msix_table_bar(struct pci_devinst *pi)
+{
+
+	if (pi->pi_msix.table == NULL)
+		return (-1);
+
+	return (pi->pi_msix.table_bar);
+}
+
+/*
+ * Return the BAR index that holds the MSI-X PBA of 'pi', or -1 when no
+ * MSI-X table has been allocated for the device.
+ */
+int
+pci_msix_pba_bar(struct pci_devinst *pi)
+{
+
+	if (pi->pi_msix.table == NULL)
+		return (-1);
+
+	return (pi->pi_msix.pba_bar);
+}
+
+/*
+ * I/O port handler installed for I/O BARs.  Finds the I/O BAR of the device
+ * ('arg') that fully contains the access [port, port + bytes) and forwards
+ * it to the emulation's pe_barread / pe_barwrite callback with the offset
+ * relative to the BAR base.
+ *
+ * Returns 0 when a BAR claimed the access and -1 otherwise.
+ */
+static int
+pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	struct pci_devinst *pdi = arg;
+	struct pci_devemu *pe = pdi->pi_d;
+	uint64_t offset;
+	int i;
+
+	for (i = 0; i <= PCI_BARMAX; i++) {
+		if (pdi->pi_bar[i].type != PCIBAR_IO)
+			continue;
+
+		/* Skip BARs that do not cover the whole access. */
+		if (port < pdi->pi_bar[i].addr ||
+		    port + bytes > pdi->pi_bar[i].addr + pdi->pi_bar[i].size)
+			continue;
+
+		offset = port - pdi->pi_bar[i].addr;
+		if (in) {
+			*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
+			    offset, bytes);
+		} else {
+			(*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
+			    bytes, *eax);
+		}
+		return (0);
+	}
+
+	return (-1);
+}
+
+/*
+ * MMIO handler installed for memory BARs.  'arg1' is the device instance
+ * and 'arg2' the BAR index.  The access is forwarded to the emulation's
+ * pe_barread / pe_barwrite callback with the offset relative to the BAR
+ * base; an 8-byte access is issued as two 4-byte accesses, low half first.
+ *
+ * Always returns 0.
+ */
+static int
+pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+    int size, uint64_t *val, void *arg1, long arg2)
+{
+	struct pci_devinst *pdi = arg1;
+	struct pci_devemu *pe = pdi->pi_d;
+	uint64_t offset;
+	int bidx = (int) arg2;
+
+	assert(bidx <= PCI_BARMAX);
+	assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
+	    pdi->pi_bar[bidx].type == PCIBAR_MEM64);
+	assert(addr >= pdi->pi_bar[bidx].addr &&
+	    addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
+
+	offset = addr - pdi->pi_bar[bidx].addr;
+
+	if (dir == MEM_F_WRITE) {
+		if (size != 8) {
+			(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
+			    size, *val);
+			return (0);
+		}
+
+		/* Split into low and high dwords. */
+		(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
+		    4, *val & 0xffffffff);
+		(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4,
+		    4, *val >> 32);
+		return (0);
+	}
+
+	if (size != 8) {
+		*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+		    offset, size);
+		return (0);
+	}
+
+	/* Assemble the result from low and high dwords. */
+	*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+	    offset, 4);
+	*val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+	    offset + 4, 4) << 32;
+
+	return (0);
+}
+
+
+/*
+ * Carve 'size' bytes (a power of 2) out of the region tracked by
+ * '*baseptr', aligned to 'size' and ending at or below 'limit'.
+ *
+ * On success the aligned address is stored in '*addr', '*baseptr' is
+ * advanced past the allocation and 0 is returned.  On failure -1 is
+ * returned and neither output is modified.
+ */
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+    uint64_t *addr)
+{
+	uint64_t start;
+
+	assert((size & (size - 1)) == 0);	/* must be a power of 2 */
+
+	start = roundup2(*baseptr, size);
+	if (start + size > limit)
+		return (-1);
+
+	*addr = start;
+	*baseptr = start + size;
+	return (0);
+}
+
+/*
+ * Allocate BAR 'idx' of 'pdi' with the given type and size.  Convenience
+ * wrapper around pci_emul_alloc_pbar() that passes a host base of 0.
+ */
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+    uint64_t size)
+{
+	int error;
+
+	error = pci_emul_alloc_pbar(pdi, idx, 0, type, size);
+	return (error);
+}
+
+/*
+ * Install (registration != 0) or remove (registration == 0) the I/O port
+ * or MMIO handler covering the range currently described by BAR 'idx' of
+ * the emulated pci device 'pi'.  Any failure, including a BAR type that is
+ * neither I/O nor memory, trips the assertion at the end.
+ */
+static void
+modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
+{
+	struct inout_port iop;
+	struct mem_range mr;
+	int error;
+
+	if (pi->pi_bar[idx].type == PCIBAR_IO) {
+		bzero(&iop, sizeof (iop));
+		iop.name = pi->pi_name;
+		iop.port = pi->pi_bar[idx].addr;
+		iop.size = pi->pi_bar[idx].size;
+		if (!registration) {
+			error = unregister_inout(&iop);
+		} else {
+			iop.flags = IOPORT_F_INOUT;
+			iop.handler = pci_emul_io_handler;
+			iop.arg = pi;
+			error = register_inout(&iop);
+		}
+	} else if (pi->pi_bar[idx].type == PCIBAR_MEM32 ||
+	    pi->pi_bar[idx].type == PCIBAR_MEM64) {
+		bzero(&mr, sizeof (mr));
+		mr.name = pi->pi_name;
+		mr.base = pi->pi_bar[idx].addr;
+		mr.size = pi->pi_bar[idx].size;
+		if (!registration) {
+			error = unregister_mem(&mr);
+		} else {
+			mr.flags = MEM_F_RW;
+			mr.handler = pci_emul_mem_handler;
+			mr.arg1 = pi;
+			mr.arg2 = idx;
+			error = register_mem(&mr);
+		}
+	} else {
+		error = EINVAL;
+	}
+
+	assert(error == 0);
+}
+
+/* Remove the handler covering BAR 'idx' of 'pi'. */
+static void
+unregister_bar(struct pci_devinst *pi, int idx)
+{
+
+	modify_bar_registration(pi, idx, /* registration */ 0);
+}
+
+/* Install the handler covering BAR 'idx' of 'pi'. */
+static void
+register_bar(struct pci_devinst *pi, int idx)
+{
+
+	modify_bar_registration(pi, idx, /* registration */ 1);
+}
+
+/*
+ * Non-zero when the command register of 'pi' has I/O port decoding
+ * (PCIM_CMD_PORTEN) enabled.
+ */
+static int
+porten(struct pci_devinst *pi)
+{
+
+	return (pci_get_cfgdata16(pi, PCIR_COMMAND) & PCIM_CMD_PORTEN);
+}
+
+/*
+ * Non-zero when the command register of 'pi' has memory decoding
+ * (PCIM_CMD_MEMEN) enabled.
+ */
+static int
+memen(struct pci_devinst *pi)
+{
+
+	return (pci_get_cfgdata16(pi, PCIR_COMMAND) & PCIM_CMD_MEMEN);
+}
+
+/*
+ * Change the MMIO or I/O address decoded by BAR 'idx' of 'pi'.
+ *
+ * 'type' says which part of the address 'addr' carries: the whole address
+ * for I/O and 32-bit memory BARs, the low 32 bits for PCIBAR_MEM64 and the
+ * high 32 bits for PCIBAR_MEMHI64.  When the device currently decodes the
+ * relevant address space, the old range is unregistered before the update
+ * and the new range is registered afterwards.
+ */
+static void
+update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
+{
+	int decode;
+
+	decode = (pi->pi_bar[idx].type == PCIBAR_IO) ? porten(pi) : memen(pi);
+
+	if (decode)
+		unregister_bar(pi, idx);
+
+	if (type == PCIBAR_IO || type == PCIBAR_MEM32) {
+		pi->pi_bar[idx].addr = addr;
+	} else if (type == PCIBAR_MEM64) {
+		/* keep the high half, replace the low half */
+		pi->pi_bar[idx].addr =
+		    (pi->pi_bar[idx].addr & ~0xffffffffUL) | addr;
+	} else if (type == PCIBAR_MEMHI64) {
+		/* keep the low half, replace the high half */
+		pi->pi_bar[idx].addr =
+		    (pi->pi_bar[idx].addr & 0xffffffff) | addr;
+	} else {
+		assert(0);
+	}
+
+	if (decode)
+		register_bar(pi, idx);
+}
+
+/*
+ * Allocate guest address space for BAR 'idx' of 'pdi', record the BAR in
+ * the device instance, program the BAR register(s) in config space and
+ * register the region for interception.
+ *
+ * 'size' is rounded up to a power of 2 and to the minimum BAR size for the
+ * type.  'hostbase' is only used as the allocation base for a 64-bit BAR
+ * of exactly 4GB.  Returns 0 on success or the error from
+ * pci_emul_alloc_resource() when the address window is exhausted.
+ */
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+    enum pcibar_type type, uint64_t size)
+{
+	uint64_t *allocbase = NULL;
+	uint64_t ceiling = 0, lowbits = 0;
+	uint64_t addr, mask, regval, minsize;
+	int error;
+
+	assert(idx >= 0 && idx <= PCI_BARMAX);
+
+	/* round up to a power of 2 */
+	if ((size & (size - 1)) != 0)
+		size = 1UL << flsl(size);
+
+	/* Enforce minimum BAR sizes required by the PCI standard */
+	minsize = (type == PCIBAR_IO) ? 4 : 16;
+	if (size < minsize)
+		size = minsize;
+
+	switch (type) {
+	case PCIBAR_NONE:
+		allocbase = NULL;
+		addr = mask = lowbits = 0;
+		break;
+	case PCIBAR_IO:
+		allocbase = &pci_emul_iobase;
+		ceiling = PCI_EMUL_IOLIMIT;
+		mask = PCIM_BAR_IO_BASE;
+		lowbits = PCIM_BAR_IO_SPACE;
+		break;
+	case PCIBAR_MEM32:
+		allocbase = &pci_emul_membase32;
+		ceiling = PCI_EMUL_MEMLIMIT32;
+		mask = PCIM_BAR_MEM_BASE;
+		lowbits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+		break;
+	case PCIBAR_MEM64:
+		/*
+		 * XXX
+		 * Some drivers do not work well if the 64-bit BAR is allocated
+		 * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless the allocation size is larger than some arbitrary
+		 * number (32MB currently).
+		 */
+		mask = PCIM_BAR_MEM_BASE;
+		if (size <= 32 * 1024 * 1024) {
+			allocbase = &pci_emul_membase32;
+			ceiling = PCI_EMUL_MEMLIMIT32;
+			lowbits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+		} else {
+			/*
+			 * XXX special case for device requiring peer-peer DMA
+			 */
+			allocbase = (size == 0x100000000UL) ?
+			    &hostbase : &pci_emul_membase64;
+			ceiling = PCI_EMUL_MEMLIMIT64;
+			lowbits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+			    PCIM_BAR_MEM_PREFETCH;
+		}
+		break;
+	default:
+		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
+#ifdef FreeBSD
+		assert(0);
+#else
+		abort();
+#endif
+	}
+
+	if (allocbase != NULL) {
+		error = pci_emul_alloc_resource(allocbase, ceiling, size,
+		    &addr);
+		if (error != 0)
+			return (error);
+	}
+
+	pdi->pi_bar[idx].type = type;
+	pdi->pi_bar[idx].addr = addr;
+	pdi->pi_bar[idx].size = size;
+
+	/* Initialize the BAR register in config space */
+	regval = (addr & mask) | lowbits;
+	pci_set_cfgdata32(pdi, PCIR_BAR(idx), regval);
+
+	/* A 64-bit BAR also occupies the next BAR slot for its high dword. */
+	if (type == PCIBAR_MEM64) {
+		assert(idx + 1 <= PCI_BARMAX);
+		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), regval >> 32);
+	}
+
+	register_bar(pdi, idx);
+
+	return (0);
+}
+
+#define	CAP_START_OFFSET	0x40
+/*
+ * Append the 'caplen'-byte capability 'capdata' to the capability list in
+ * the config space of 'pi'.
+ *
+ * The first capability goes at CAP_START_OFFSET and sets the capability
+ * pointer and the "capabilities present" status bit; later ones follow the
+ * previous capability (dword aligned) and are linked from its next-pointer
+ * byte.  Returns 0 on success and -1 when config space has no room left.
+ */
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+	uint16_t sts;
+	int first, where, padded, n;
+
+	assert(caplen > 0);
+
+	padded = roundup2(caplen, 4);		/* dword aligned */
+
+	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+	first = ((sts & PCIM_STATUS_CAPPRESENT) == 0);
+	where = first ? CAP_START_OFFSET : pi->pi_capend + 1;
+
+	/* Check if we have enough space */
+	if (where + padded > PCI_REGMAX + 1)
+		return (-1);
+
+	/* Link the new capability into the list */
+	if (first) {
+		pci_set_cfgdata8(pi, PCIR_CAP_PTR, where);
+		pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+	} else {
+		pci_set_cfgdata8(pi, pi->pi_prevcap + 1, where);
+	}
+
+	/* Copy the capability */
+	for (n = 0; n < caplen; n++)
+		pci_set_cfgdata8(pi, where + n, capdata[n]);
+
+	/* The new capability is the last one: terminate the list */
+	pci_set_cfgdata8(pi, where + 1, 0);
+
+	pi->pi_prevcap = where;
+	pi->pi_capend = where + padded - 1;
+	return (0);
+}
+
+/*
+ * Look up a device emulation by name.  Returns the entry of pci_devemu_set
+ * whose pe_emu equals 'name', or NULL when there is none.
+ */
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+	struct pci_devemu **cursor;
+
+	SET_FOREACH(cursor, pci_devemu_set) {
+		if (strcmp((*cursor)->pe_emu, name) == 0)
+			return (*cursor);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Create and initialize the device instance for function bus/slot/func
+ * using emulation 'pde', then run the emulation's pe_init hook with the
+ * config string from 'fi'.
+ *
+ * Returns 0 on success, in which case the instance is stored in
+ * fi->fi_devi.  Returns ENOMEM when the instance cannot be allocated, or
+ * the non-zero value from pe_init; in both cases nothing is left allocated.
+ */
+static int
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
+    int func, struct funcinfo *fi)
+{
+	struct pci_devinst *pdi;
+	int err;
+
+	pdi = calloc(1, sizeof(struct pci_devinst));
+	if (pdi == NULL)
+		return (ENOMEM);
+
+	pdi->pi_vmctx = ctx;
+	pdi->pi_bus = bus;
+	pdi->pi_slot = slot;
+	pdi->pi_func = func;
+	pthread_mutex_init(&pdi->pi_lintr.lock, NULL);
+	pdi->pi_lintr.pin = 0;
+	pdi->pi_lintr.state = IDLE;
+	pdi->pi_lintr.pirq_pin = 0;
+	pdi->pi_lintr.ioapic_irq = 0;
+	pdi->pi_d = pde;
+	snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+	/* Disable legacy interrupts */
+	pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+	pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+	/* Start with I/O, memory and bus-master access enabled. */
+	pci_set_cfgdata8(pdi, PCIR_COMMAND,
+	    PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+	err = (*pde->pe_init)(ctx, pdi, fi->fi_param);
+	if (err == 0) {
+		fi->fi_devi = pdi;
+	} else {
+		/* Release what pthread_mutex_init() may have set up. */
+		(void) pthread_mutex_destroy(&pdi->pi_lintr.lock);
+		free(pdi);
+	}
+
+	return (err);
+}
+
+/*
+ * Fill '*msicap' with an MSI capability that advertises 'msgnum' messages
+ * and 64-bit addressing and whose next-capability pointer is 'nextptr'.
+ * 'msgnum' must be a power of 2 between 1 and 32.
+ */
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+	int log2msgs;
+
+	/* Number of msi messages must be a power of 2 between 1 and 32 */
+	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+
+	bzero(msicap, sizeof (*msicap));
+
+	/* The message count is stored as its base-2 logarithm. */
+	log2msgs = ffs(msgnum) - 1;
+
+	msicap->capid = PCIY_MSI;
+	msicap->nextptr = nextptr;
+	msicap->msgctrl = PCIM_MSICTRL_64BIT | (log2msgs << 1);
+}
+
+/*
+ * Add an MSI capability advertising 'msgnum' messages to 'pi'.  Returns
+ * the result of pci_emul_add_capability().
+ */
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+	struct msicap cap;
+	int error;
+
+	pci_populate_msicap(&cap, msgnum, 0);
+	error = pci_emul_add_capability(pi, (u_char *)&cap, sizeof(cap));
+
+	return (error);
+}
+
+/*
+ * Fill '*msixcap' with an MSI-X capability for 'msgnum' vectors whose table
+ * and PBA both live in BAR 'barnum': the table at offset 0 and the PBA at
+ * offset 'msix_tab_size', which must be a multiple of 4096.
+ */
+static void
+pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
+    uint32_t msix_tab_size)
+{
+	uint32_t bir;
+
+	assert(msix_tab_size % 4096 == 0);
+
+	bzero(msixcap, sizeof (*msixcap));
+	msixcap->capid = PCIY_MSIX;
+
+	/*
+	 * Message Control Register, all fields set to
+	 * zero except for the Table Size.
+	 * Note: Table size N is encoded as N-1
+	 */
+	msixcap->msgctrl = msgnum - 1;
+
+	/*
+	 * MSI-X BAR setup:
+	 * - MSI-X table start at offset 0
+	 * - PBA table starts at a 4K aligned offset after the MSI-X table
+	 */
+	bir = barnum & PCIM_MSIX_BIR_MASK;
+	msixcap->table_info = bir;
+	msixcap->pba_info = msix_tab_size | bir;
+}
+
+/*
+ * Allocate a zeroed MSI-X table with 'table_entries' entries for 'pi' and
+ * set the mask bit in the vector control word of every entry.
+ */
+static void
+pci_msix_table_init(struct pci_devinst *pi, int table_entries)
+{
+	int entry;
+
+	assert(table_entries > 0);
+	assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
+
+	pi->pi_msix.table = calloc(table_entries, MSIX_TABLE_ENTRY_SIZE);
+
+	/* set mask bit of vector control register */
+	entry = 0;
+	while (entry < table_entries) {
+		pi->pi_msix.table[entry].vector_control |=
+		    PCIM_MSIX_VCTRL_MASK;
+		entry++;
+	}
+}
+
+/*
+ * Give 'pi' an MSI-X capability with 'msgnum' vectors whose table and PBA
+ * live in BAR 'barnum' (allocated here as a 32-bit memory BAR).
+ *
+ * Returns 0 on success.  Returns the error from pci_emul_alloc_bar() when
+ * the BAR cannot be allocated, in which case no MSI-X state is set up and
+ * no capability is added, or the error from pci_emul_add_capability().
+ */
+int
+pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
+{
+	uint32_t tab_size;
+	struct msixcap msixcap;
+	int error;
+
+	assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
+	assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
+
+	tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
+
+	/* Align table size to nearest 4K */
+	tab_size = roundup2(tab_size, 4096);
+
+	/*
+	 * Allocate the BAR for the MSI-X Table and PBA first, so that a
+	 * failure is reported before any MSI-X state has been created.
+	 */
+	error = pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
+	    tab_size + PBA_SIZE(msgnum));
+	if (error != 0)
+		return (error);
+
+	pi->pi_msix.table_bar = barnum;
+	pi->pi_msix.pba_bar = barnum;
+	pi->pi_msix.table_offset = 0;
+	pi->pi_msix.table_count = msgnum;
+	pi->pi_msix.pba_offset = tab_size;
+	pi->pi_msix.pba_size = PBA_SIZE(msgnum);
+
+	pci_msix_table_init(pi, msgnum);
+
+	pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size);
+
+	return (pci_emul_add_capability(pi, (u_char *)&msixcap,
+	    sizeof(msixcap)));
+}
+
+/*
+ * Handle a config-space write of 'bytes' bytes at 'offset' that falls in
+ * the MSI-X capability starting at 'capoff'.
+ *
+ * For a 2-byte write to the Message Control register only the enable and
+ * function-mask bits are taken from 'val'; the cached MSI-X state of 'pi'
+ * is refreshed and pci_lintr_update() is called before the register is
+ * stored.  Every other write is stored unchanged.
+ */
+void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+	uint16_t ctrl, writable;
+
+	if (offset - capoff != 2 || bytes != 2) {
+		CFGWRITE(pi, offset, val, bytes);
+		return;
+	}
+
+	/* Message Control Register */
+	writable = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+	ctrl = pci_get_cfgdata16(pi, offset);
+	ctrl = (ctrl & ~writable) | (val & writable);
+	val = ctrl;
+
+	pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+	pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
+	pci_lintr_update(pi);
+
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+/*
+ * Handle a config-space write of 'bytes' bytes at 'offset' that falls in
+ * the MSI capability starting at 'capoff'.
+ *
+ * For a 2-byte write to the Message Control register only the
+ * multiple-message-enable and MSI-enable fields are taken from 'val', so
+ * read-only fields are preserved.  The cached MSI state of 'pi' (enabled
+ * flag, address, data, message count) is refreshed and pci_lintr_update()
+ * is called before the register is stored.  Every other write is stored
+ * unchanged.
+ */
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+	uint16_t ctrl, writable, data, mme;
+	uint32_t addrlo;
+
+	if ((offset - capoff) != 2 || bytes != 2) {
+		CFGWRITE(pi, offset, val, bytes);
+		return;
+	}
+
+	writable = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+	ctrl = pci_get_cfgdata16(pi, offset);
+	ctrl = (ctrl & ~writable) | (val & writable);
+	val = ctrl;
+
+	addrlo = pci_get_cfgdata32(pi, capoff + 4);
+
+	/* The data register sits after a 64-bit or a 32-bit address. */
+	data = pci_get_cfgdata16(pi,
+	    (ctrl & PCIM_MSICTRL_64BIT) ? capoff + 12 : capoff + 8);
+
+	mme = ctrl & PCIM_MSICTRL_MME_MASK;
+	pi->pi_msi.enabled = (ctrl & PCIM_MSICTRL_MSI_ENABLE) ? 1 : 0;
+	if (!pi->pi_msi.enabled) {
+		pi->pi_msi.maxmsgnum = 0;
+	} else {
+		pi->pi_msi.addr = addrlo;
+		pi->pi_msi.msg_data = data;
+		pi->pi_msi.maxmsgnum = 1 << (mme >> 4);
+	}
+	pci_lintr_update(pi);
+
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+/*
+ * Handle a config-space write that falls in the PCI Express capability
+ * starting at 'capoff'.  The value is stored as-is; 'capoff' is unused.
+ */
+void
+pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+
+	/* XXX read-only fields are not protected from guest writes */
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+#define	PCIECAP_VERSION	0x2
+/*
+ * Add a PCI Express capability to 'pi'.  Only PCIEM_TYPE_ROOT_PORT is
+ * supported; any other 'type' returns -1.  Otherwise returns the result of
+ * pci_emul_add_capability().
+ */
+int
+pci_emul_add_pciecap(struct pci_devinst *pi, int type)
+{
+	struct pciecap cap;
+
+	if (type != PCIEM_TYPE_ROOT_PORT)
+		return (-1);
+
+	bzero(&cap, sizeof (cap));
+
+	cap.capid = PCIY_EXPRESS;
+	cap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT;
+	cap.link_capabilities = 0x411;		/* gen1, x1 */
+	cap.link_status = 0x11;			/* gen1, x1 */
+
+	return (pci_emul_add_capability(pi, (u_char *)&cap, sizeof(cap)));
+}
+
+/*
+ * Dispatch a config-space write to the handler of the capability that
+ * contains 'offset'.
+ *
+ * This function assumes that 'offset' is in the capabilities region of the
+ * config space.  Unaligned writes and writes to capabilities of an
+ * unhandled type are dropped.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+	uint8_t capoff, nextoff;
+	int capid;
+
+	/* Do not allow un-aligned writes */
+	if ((offset & (bytes - 1)) != 0)
+		return;
+
+	/*
+	 * Walk the capability list until the capability containing 'offset'
+	 * is found or the last capability is reached.
+	 */
+	capoff = CAP_START_OFFSET;
+	for (;;) {
+		nextoff = pci_get_cfgdata8(pi, capoff + 1);
+		if (nextoff == 0 || (offset >= capoff && offset < nextoff))
+			break;
+		capoff = nextoff;
+	}
+	assert(offset >= capoff);
+
+	/*
+	 * Capability ID and Next Capability Pointer are readonly.
+	 * However, some o/s's do 4-byte writes that include these.
+	 * For this case, trim the write back to 2 bytes and adjust
+	 * the data.
+	 */
+	if (offset == capoff && bytes == 4) {
+		bytes = 2;
+		offset += 2;
+		val >>= 16;
+	} else if (offset == capoff || offset == capoff + 1) {
+		return;
+	}
+
+	capid = pci_get_cfgdata8(pi, capoff);
+	if (capid == PCIY_MSI)
+		msicap_cfgwrite(pi, capoff, offset, bytes, val);
+	else if (capid == PCIY_MSIX)
+		msixcap_cfgwrite(pi, capoff, offset, bytes, val);
+	else if (capid == PCIY_EXPRESS)
+		pciecap_cfgwrite(pi, capoff, offset, bytes, val);
+}
+
+/*
+ * Return 1 when 'pi' has capabilities and 'offset' lies within
+ * [CAP_START_OFFSET, pi_capend], otherwise 0.
+ */
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+	uint16_t sts;
+
+	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+
+	return ((sts & PCIM_STATUS_CAPPRESENT) != 0 &&
+	    offset >= CAP_START_OFFSET && offset <= pi->pi_capend);
+}
+
+/*
+ * Fallback MMIO handler for addresses that nothing else claims: reads
+ * return all ones and writes are discarded.  Always returns 0.
+ */
+static int
+pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+    int size, uint64_t *val, void *arg1, long arg2)
+{
+
+	/*
+	 * The mem read code takes care of truncating the value to the
+	 * size of the access.
+	 */
+	if (dir == MEM_F_READ)
+		*val = ~0ULL;
+
+	return (0);
+}
+
+/*
+ * MMIO handler for the extended config window.  The bus, slot, function
+ * and register offset are decoded from the low bits of 'addr' and the
+ * access is passed to pci_cfgrw().  For reads '*val' is preset to all ones
+ * before the call.  Always returns 0.
+ */
+static int
+pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+    int bytes, uint64_t *val, void *arg1, long arg2)
+{
+	int coff = addr & 0xfff;		/* bits 11:0 */
+	int func = (addr >> 12) & 0x7;		/* bits 14:12 */
+	int slot = (addr >> 15) & 0x1f;		/* bits 19:15 */
+	int bus = (addr >> 20) & 0xff;		/* bits 27:20 */
+	int in = (dir == MEM_F_READ);
+
+	if (in)
+		*val = ~0UL;
+	pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
+	return (0);
+}
+
+/* Return the guest physical base address of the extended config window. */
+uint64_t
+pci_ecfg_base(void)
+{
+	uint64_t base = PCI_EMUL_ECFG_BASE;
+
+	return (base);
+}
+
+#define	BUSIO_ROUNDUP		32
+#define	BUSMEM_ROUNDUP		(1024 * 1024)
+
+/*
+ * Bring up every PCI function configured through pci_parse_slot().
+ *
+ * Pass 1 initializes the device instances bus by bus and records the I/O
+ * and memory windows each bus ends up with.  Pass 2 routes the legacy
+ * interrupts.  Finally the fallback handler for the hole below 4GB and the
+ * extended config space handler are registered.
+ *
+ * Returns 0 on success or the first non-zero value from pci_emul_init().
+ */
+int
+init_pci(struct vmctx *ctx)
+{
+	struct mem_range range;
+	struct pci_devemu *emu;
+	struct businfo *businfo;
+	struct funcinfo *finfo;
+	size_t lowmem;
+	int b, s, f;
+	int rc;
+
+	pci_emul_iobase = PCI_EMUL_IOBASE;
+	pci_emul_membase32 = vm_get_lowmem_limit(ctx);
+	pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+	for (b = 0; b < MAXBUSES; b++) {
+		businfo = pci_businfo[b];
+		if (businfo == NULL)
+			continue;
+
+		/*
+		 * Remember where this bus's i/o and memory resources start.
+		 */
+		businfo->iobase = pci_emul_iobase;
+		businfo->membase32 = pci_emul_membase32;
+		businfo->membase64 = pci_emul_membase64;
+
+		for (s = 0; s < MAXSLOTS; s++) {
+			for (f = 0; f < MAXFUNCS; f++) {
+				finfo = &businfo->slotinfo[s].si_funcs[f];
+				if (finfo->fi_name == NULL)
+					continue;
+				emu = pci_emul_finddev(finfo->fi_name);
+				assert(emu != NULL);
+				rc = pci_emul_init(ctx, emu, b, s, f, finfo);
+				if (rc)
+					return (rc);
+			}
+		}
+
+		/*
+		 * Add some slop to the I/O and memory resources decoded by
+		 * this bus to give a guest some flexibility if it wants to
+		 * reprogram the BARs.
+		 */
+		pci_emul_iobase = roundup2(pci_emul_iobase + BUSIO_ROUNDUP,
+		    BUSIO_ROUNDUP);
+		businfo->iolimit = pci_emul_iobase;
+
+		pci_emul_membase32 = roundup2(
+		    pci_emul_membase32 + BUSMEM_ROUNDUP, BUSMEM_ROUNDUP);
+		businfo->memlimit32 = pci_emul_membase32;
+
+		pci_emul_membase64 = roundup2(
+		    pci_emul_membase64 + BUSMEM_ROUNDUP, BUSMEM_ROUNDUP);
+		businfo->memlimit64 = pci_emul_membase64;
+	}
+
+	/*
+	 * PCI backends are initialized before routing INTx interrupts
+	 * so that LPC devices are able to reserve ISA IRQs before
+	 * routing PIRQ pins.
+	 */
+	for (b = 0; b < MAXBUSES; b++) {
+		businfo = pci_businfo[b];
+		if (businfo == NULL)
+			continue;
+
+		for (s = 0; s < MAXSLOTS; s++) {
+			for (f = 0; f < MAXFUNCS; f++) {
+				finfo = &businfo->slotinfo[s].si_funcs[f];
+				if (finfo->fi_devi != NULL)
+					pci_lintr_route(finfo->fi_devi);
+			}
+		}
+	}
+	lpc_pirq_routed();
+
+	/*
+	 * The guest physical memory map looks like the following:
+	 * [0,		    lowmem)		guest system memory
+	 * [lowmem,	    lowmem_limit)	memory hole (may be absent)
+	 * [lowmem_limit,   0xE0000000)		PCI hole (32-bit BAR allocation)
+	 * [0xE0000000,	    0xF0000000)		PCI extended config window
+	 * [0xF0000000,	    4GB)		LAPIC, IOAPIC, HPET, firmware
+	 * [4GB,	    4GB + highmem)
+	 */
+
+	/*
+	 * Accesses to memory addresses that are not allocated to system
+	 * memory or PCI devices return 0xff's.
+	 */
+	lowmem = vm_get_lowmem_size(ctx);
+	bzero(&range, sizeof (range));
+	range.name = "PCI hole";
+	range.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	range.base = lowmem;
+	range.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+	range.handler = pci_emul_fallback_handler;
+	rc = register_mem_fallback(&range);
+	assert(rc == 0);
+
+	/* PCI extended config space */
+	bzero(&range, sizeof (range));
+	range.name = "PCI ECFG";
+	range.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	range.base = PCI_EMUL_ECFG_BASE;
+	range.size = PCI_EMUL_ECFG_SIZE;
+	range.handler = pci_emul_ecfg_handler;
+	rc = register_mem(&range);
+	assert(rc == 0);
+
+	return (0);
+}
+
+/*
+ * Emit one DSDT package describing the routing of INTx pin 'pin' of slot
+ * 'slot' to I/O APIC interrupt 'ioapic_irq'.  'bus', 'pirq_pin' and 'arg'
+ * are unused.
+ */
+static void
+pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
+{
+	int adr = slot << 16 | 0xffff;
+	int pinidx = pin - 1;		/* the table uses 0-based pin numbers */
+
+	dsdt_line(" Package ()");
+	dsdt_line(" {");
+	dsdt_line(" 0x%X,", adr);
+	dsdt_line(" 0x%02X,", pinidx);
+	dsdt_line(" Zero,");
+	dsdt_line(" 0x%X", ioapic_irq);
+	dsdt_line(" },");
+}
+
+/*
+ * Emit one DSDT package describing the routing of INTx pin 'pin' of slot
+ * 'slot' to the PIRQ link named by lpc_pirq_name(pirq_pin).  Nothing is
+ * emitted when no name is available.  The name is released with free()
+ * after use.  'bus', 'ioapic_irq' and 'arg' are unused.
+ */
+static void
+pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
+{
+	char *linkname;
+	int adr, pinidx;
+
+	linkname = lpc_pirq_name(pirq_pin);
+	if (linkname == NULL)
+		return;
+
+	adr = slot << 16 | 0xffff;
+	pinidx = pin - 1;		/* the table uses 0-based pin numbers */
+
+	dsdt_line(" Package ()");
+	dsdt_line(" {");
+	dsdt_line(" 0x%X,", adr);
+	dsdt_line(" 0x%02X,", pinidx);
+	dsdt_line(" %s,", linkname);
+	dsdt_line(" 0x00");
+	dsdt_line(" },");
+	free(linkname);
+}
+
+/*
+ * A bhyve virtual machine has a flat PCI hierarchy with a root port
+ * corresponding to each PCI bus.
+ */
+static void
+pci_bus_write_dsdt(int bus)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ int count, func, slot;
+
+ /*
+ * If there are no devices on this 'bus' then just return.
+ */
+ if ((bi = pci_businfo[bus]) == NULL) {
+ /*
+ * Bus 0 is special because it decodes the I/O ports used
+ * for PCI config space access even if there are no devices
+ * on it.
+ */
+ if (bus != 0)
+ return;
+ }
+
+ dsdt_line(" Device (PC%02X)", bus);
+ dsdt_line(" {");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))");
+ dsdt_line(" Name (_ADR, Zero)");
+
+ dsdt_line(" Method (_BBN, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x%08X)", bus);
+ dsdt_line(" }");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, "
+ "MaxFixed, PosDecode,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bus);
+ dsdt_line(" 0x%04X, // Range Maximum", bus);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0001, // Length");
+ dsdt_line(" ,, )");
+
+ if (bus == 0) {
+ dsdt_indent(3);
+ dsdt_fixed_ioport(0xCF8, 8);
+ dsdt_unindent(3);
+
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0000, // Range Minimum");
+ dsdt_line(" 0x0CF7, // Range Maximum");
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0CF8, // Length");
+ dsdt_line(" ,, , TypeStatic)");
+
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0D00, // Range Minimum");
+ dsdt_line(" 0x%04X, // Range Maximum",
+ PCI_EMUL_IOBASE - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ PCI_EMUL_IOBASE - 0x0D00);
+ dsdt_line(" ,, , TypeStatic)");
+
+ if (bi == NULL) {
+ dsdt_line(" })");
+ goto done;
+ }
+ }
+ assert(bi != NULL);
+
+ /* i/o window */
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bi->iobase);
+ dsdt_line(" 0x%04X, // Range Maximum",
+ bi->iolimit - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ bi->iolimit - bi->iobase);
+ dsdt_line(" ,, , TypeStatic)");
+
+ /* mmio window (32-bit) */
+ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x00000000, // Granularity");
+ dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32);
+ dsdt_line(" 0x%08X, // Range Maximum\n",
+ bi->memlimit32 - 1);
+ dsdt_line(" 0x00000000, // Translation Offset");
+ dsdt_line(" 0x%08X, // Length\n",
+ bi->memlimit32 - bi->membase32);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+
+ /* mmio window (64-bit) */
+ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x0000000000000000, // Granularity");
+ dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64);
+ dsdt_line(" 0x%016lX, // Range Maximum\n",
+ bi->memlimit64 - 1);
+ dsdt_line(" 0x0000000000000000, // Translation Offset");
+ dsdt_line(" 0x%016lX, // Length\n",
+ bi->memlimit64 - bi->membase64);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+ dsdt_line(" })");
+
+ count = pci_count_lintr(bus);
+ if (count != 0) {
+ dsdt_indent(2);
+ dsdt_line("Name (PPRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Name (APRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Method (_PRT, 0, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" If (PICM)");
+ dsdt_line(" {");
+ dsdt_line(" Return (APRT)");
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Return (PPRT)");
+ dsdt_line(" }");
+ dsdt_line("}");
+ dsdt_unindent(2);
+ }
+
+ dsdt_indent(2);
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ pi = si->si_funcs[func].fi_devi;
+ if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL)
+ pi->pi_d->pe_write_dsdt(pi);
+ }
+ }
+ dsdt_unindent(2);
+done:
+ dsdt_line(" }");
+}
+
+/*
+ * Emit the PCI part of the DSDT: the PICM variable with its _PIC setter,
+ * then an _SB scope into which every possible bus gets a chance to write
+ * its own description.
+ */
+void
+pci_write_dsdt(void)
+{
+	int busno;
+
+	dsdt_indent(1);
+
+	/* PICM records the argument the guest hands to _PIC. */
+	dsdt_line("Name (PICM, 0x00)");
+	dsdt_line("Method (_PIC, 1, NotSerialized)");
+	dsdt_line("{");
+	dsdt_line(" Store (Arg0, PICM)");
+	dsdt_line("}");
+	dsdt_line("");
+
+	/* One pci_bus_write_dsdt() call per bus number, in ascending order. */
+	dsdt_line("Scope (_SB)");
+	dsdt_line("{");
+	busno = 0;
+	while (busno < MAXBUSES) {
+		pci_bus_write_dsdt(busno);
+		busno++;
+	}
+	dsdt_line("}");
+
+	dsdt_unindent(1);
+}
+
+/*
+ * Return non-zero when a businfo exists for 'bus' and zero otherwise.
+ * 'bus' must be within [0, MAXBUSES).
+ */
+int
+pci_bus_configured(int bus)
+{
+	struct businfo *info;
+
+	assert(bus >= 0 && bus < MAXBUSES);
+	info = pci_businfo[bus];
+	return (info != NULL);
+}
+
+/*
+ * Return the MSI enable flag recorded for device instance 'pi'.
+ */
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+	int enabled;
+
+	enabled = pi->pi_msi.enabled;
+	return (enabled);
+}
+
+/*
+ * Return the recorded MSI message count for 'pi', or 0 while MSI is
+ * disabled.
+ */
+int
+pci_msi_maxmsgnum(struct pci_devinst *pi)
+{
+	return (pi->pi_msi.enabled ? pi->pi_msi.maxmsgnum : 0);
+}
+
+/*
+ * Return 1 when MSI-X is enabled for 'pi' and plain MSI is not; 0 otherwise.
+ */
+int
+pci_msix_enabled(struct pci_devinst *pi)
+{
+	if (!pi->pi_msix.enabled)
+		return (0);
+
+	return (!pi->pi_msi.enabled);
+}
+
+/*
+ * Deliver MSI-X message 'index' for device 'pi'.
+ *
+ * Nothing is sent when MSI-X is not the active mode (pci_msix_enabled()),
+ * when the function mask is set, when 'index' lies outside the table, or
+ * when the selected table entry has its mask bit set.
+ */
+void
+pci_generate_msix(struct pci_devinst *pi, int index)
+{
+	struct msix_table_entry *mte;
+
+	if (!pci_msix_enabled(pi))
+		return;
+
+	if (pi->pi_msix.function_mask)
+		return;
+
+	/*
+	 * 'index' is signed, so reject negative values as well as ones
+	 * past the end; either would index outside the table.
+	 */
+	if (index < 0 || index >= pi->pi_msix.table_count)
+		return;
+
+	mte = &pi->pi_msix.table[index];
+	if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+		/* XXX Set PBA bit if interrupt is disabled */
+		vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data);
+	}
+}
+
+/*
+ * Deliver MSI message 'index' for device 'pi'.
+ *
+ * The message data sent is the programmed msg_data plus 'index'.  Nothing
+ * is sent when MSI is disabled or when 'index' is outside
+ * [0, pci_msi_maxmsgnum(pi)).
+ */
+void
+pci_generate_msi(struct pci_devinst *pi, int index)
+{
+
+	/* A negative index would otherwise be added to msg_data below. */
+	if (index < 0)
+		return;
+
+	if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) {
+		vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr,
+		    pi->pi_msi.msg_data + index);
+	}
+}
+
+/*
+ * Tell whether legacy INTx delivery is currently allowed for 'pi': neither
+ * MSI nor MSI-X may be enabled and the INTx-disable bit in the command
+ * register must be clear.
+ */
+static bool
+pci_lintr_permitted(struct pci_devinst *pi)
+{
+	uint16_t command;
+
+	command = pci_get_cfgdata16(pi, PCIR_COMMAND);
+	if (pi->pi_msi.enabled || pi->pi_msix.enabled)
+		return (false);
+
+	return ((command & PCIM_CMD_INTxDIS) == 0);
+}
+
+/*
+ * Reserve a legacy interrupt pin for 'pi'.
+ *
+ * The least-used of the slot's four pins is picked (lowest-numbered on a
+ * tie), its use count is bumped, and the 1-based pin number is stored both
+ * in pi_lintr.pin and in the PCIR_INTPIN config register.
+ */
+void
+pci_lintr_request(struct pci_devinst *pi)
+{
+	struct businfo *bi;
+	struct intxinfo *pins;
+	int chosen, lowest, candidate;
+
+	bi = pci_businfo[pi->pi_bus];
+	assert(bi != NULL);
+
+	/*
+	 * Only a pin within our own slot is selected here.  IRQs get
+	 * attached to it later, when interrupts are routed.
+	 */
+	pins = bi->slotinfo[pi->pi_slot].si_intpins;
+	chosen = 0;
+	lowest = pins[0].ii_count;
+	for (candidate = 1; candidate < 4; candidate++) {
+		if (pins[candidate].ii_count < lowest) {
+			lowest = pins[candidate].ii_count;
+			chosen = candidate;
+		}
+	}
+
+	pins[chosen].ii_count++;
+	pi->pi_lintr.pin = chosen + 1;
+	pci_set_cfgdata8(pi, PCIR_INTPIN, chosen + 1);
+}
+
+/*
+ * Bind the legacy interrupt pin of 'pi' to an I/O APIC IRQ and a PIRQ pin.
+ *
+ * Devices without a pin are left alone.  The routing is stored per slot
+ * pin, so it is allocated only when that pin has none yet; the result is
+ * copied into pi_lintr and PCIR_INTLINE is set from the PIRQ pin.
+ */
+static void
+pci_lintr_route(struct pci_devinst *pi)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct intxinfo *ii;
+
+	if (pi->pi_lintr.pin == 0)
+		return;
+
+	bi = pci_businfo[pi->pi_bus];
+	assert(bi != NULL);
+	si = &bi->slotinfo[pi->pi_slot];
+	ii = &si->si_intpins[pi->pi_lintr.pin - 1];
+
+	/* Obtain an I/O APIC pin unless this intpin already has one. */
+	if (ii->ii_ioapic_irq == 0) {
+		ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi);
+	}
+	assert(ii->ii_ioapic_irq > 0);
+
+	/* Likewise obtain a PIRQ pin unless one is already assigned. */
+	if (ii->ii_pirq_pin == 0) {
+		ii->ii_pirq_pin = pirq_alloc_pin(pi);
+	}
+	assert(ii->ii_pirq_pin > 0);
+
+	pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
+	pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
+	pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
+}
+
+/*
+ * Raise the legacy interrupt of 'pi'.
+ *
+ * Only an IDLE line changes state: it becomes ASSERTED (and the IRQ is
+ * asserted) when INTx delivery is permitted, PENDING otherwise.  The device
+ * must already own a pin.
+ */
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == IDLE) {
+		if (!pci_lintr_permitted(pi)) {
+			pi->pi_lintr.state = PENDING;
+		} else {
+			pi->pi_lintr.state = ASSERTED;
+			pci_irq_assert(pi);
+		}
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+/*
+ * Lower the legacy interrupt of 'pi'.
+ *
+ * An ASSERTED line goes back to IDLE and the IRQ is deasserted; a PENDING
+ * line is simply reset to IDLE.  The device must already own a pin.
+ */
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	switch (pi->pi_lintr.state) {
+	case ASSERTED:
+		pi->pi_lintr.state = IDLE;
+		pci_irq_deassert(pi);
+		break;
+	case PENDING:
+		pi->pi_lintr.state = IDLE;
+		break;
+	default:
+		break;
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+/*
+ * Re-evaluate the legacy interrupt state of 'pi' against the current INTx
+ * permission: an ASSERTED line that is no longer permitted is deasserted
+ * and parked as PENDING, and a PENDING line that is now permitted is
+ * asserted.  On non-FreeBSD builds the emulation's optional
+ * pe_lintrupdate hook runs afterwards, outside the lock.
+ */
+static void
+pci_lintr_update(struct pci_devinst *pi)
+{
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	switch (pi->pi_lintr.state) {
+	case ASSERTED:
+		if (!pci_lintr_permitted(pi)) {
+			pci_irq_deassert(pi);
+			pi->pi_lintr.state = PENDING;
+		}
+		break;
+	case PENDING:
+		if (pci_lintr_permitted(pi)) {
+			pi->pi_lintr.state = ASSERTED;
+			pci_irq_assert(pi);
+		}
+		break;
+	default:
+		break;
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+#ifndef __FreeBSD__
+	if (pi->pi_d->pe_lintrupdate != NULL)
+		pi->pi_d->pe_lintrupdate(pi);
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Count the (slot, pin) pairs on 'bus' that have at least one user.
+ * A bus without a businfo counts as zero.
+ */
+int
+pci_count_lintr(int bus)
+{
+	struct businfo *info;
+	struct slotinfo *si;
+	int total, s, p;
+
+	info = pci_businfo[bus];
+	if (info == NULL)
+		return (0);
+
+	total = 0;
+	for (s = 0; s < MAXSLOTS; s++) {
+		si = &info->slotinfo[s];
+		for (p = 0; p < 4; p++) {
+			if (si->si_intpins[p].ii_count == 0)
+				continue;
+			total++;
+		}
+	}
+	return (total);
+}
+
+/*
+ * Invoke 'cb' for every in-use interrupt pin on 'bus', in slot order and
+ * then pin order.  The callback receives the bus, the slot, the 1-based
+ * pin number, the pin's PIRQ pin and I/O APIC IRQ, and 'arg'.  Nothing
+ * happens for a bus without a businfo.
+ */
+void
+pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
+{
+	struct businfo *info;
+	struct intxinfo *intx;
+	int s, p;
+
+	info = pci_businfo[bus];
+	if (info == NULL)
+		return;
+
+	for (s = 0; s < MAXSLOTS; s++) {
+		for (p = 0; p < 4; p++) {
+			intx = &info->slotinfo[s].si_intpins[p];
+			if (intx->ii_count == 0)
+				continue;
+			cb(bus, s, p + 1, intx->ii_pirq_pin,
+			    intx->ii_ioapic_irq, arg);
+		}
+	}
+}
+
+/*
+ * Report whether 'slot' on 'bus' holds a multi-function device, i.e. more
+ * than one of its functions has a device instance attached.
+ * Returns 1 if so and 0 otherwise (including when the bus does not exist).
+ */
+static int
+pci_emul_is_mfdev(int bus, int slot)
+{
+	struct businfo *info;
+	struct slotinfo *si;
+	int fn, populated;
+
+	info = pci_businfo[bus];
+	if (info == NULL)
+		return (0);
+
+	si = &info->slotinfo[slot];
+	populated = 0;
+	for (fn = 0; fn < MAXFUNCS; fn++) {
+		if (si->si_funcs[fn].fi_devi == NULL)
+			continue;
+		populated++;
+	}
+	return (populated > 1);
+}
+
+/*
+ * Force the PCIM_MFDEV bit in a config-space read result to reflect
+ * whether a multi-function device is being emulated in 'slot'.
+ *
+ * Only reads that cover PCIR_HDRTYPE are touched.  For 1- and 2-byte reads
+ * the bit is applied at its natural position in *rv; for 4-byte reads it
+ * is applied 16 bits higher.  Other access sizes leave *rv alone.
+ */
+static void
+pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
+{
+	uint32_t mfbit;
+
+	/* Bail out unless [off, off + bytes) contains the header type. */
+	if (off > PCIR_HDRTYPE || off + bytes <= PCIR_HDRTYPE)
+		return;
+
+	switch (bytes) {
+	case 1:
+	case 2:
+		mfbit = PCIM_MFDEV;
+		break;
+	case 4:
+		mfbit = PCIM_MFDEV << 16;
+		break;
+	default:
+		return;
+	}
+
+	if (pci_emul_is_mfdev(bus, slot))
+		*rv |= mfbit;
+	else
+		*rv &= ~mfbit;
+}
+
+/*
+ * Apply a guest write of 'bytes' bytes at config offset 'coff' that lands
+ * in the command/status registers.
+ *
+ * Read-only bits keep their previous value.  When the write flips the I/O
+ * or memory decode enable, every BAR of the matching kind is registered or
+ * unregistered.  Finally the legacy interrupt state is re-evaluated.
+ */
+static void
+pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes)
+{
+	enum pcibar_type type;
+	uint32_t before, after, delta, current, romask, merged;
+	int barno, decoding;
+
+	/* Remember the command register as it was before the write. */
+	before = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+	/*
+	 * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3.
+	 *
+	 * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
+	 * 'write 1 to clear'. However these bits are not set to '1' by
+	 * any device emulation so it is simpler to treat them as readonly.
+	 *
+	 * The mask is expressed relative to the 32-bit command/status
+	 * dword and shifted down to line up with the bytes being written.
+	 */
+	romask = 0xFFFFF880 >> ((coff & 0x3) * 8);
+
+	current = CFGREAD(pi, coff, bytes);
+	merged = (new & ~romask) | (current & romask);
+	CFGWRITE(pi, coff, merged, bytes);
+
+	/* Which command bits did the write actually change? */
+	after = pci_get_cfgdata16(pi, PCIR_COMMAND);
+	delta = before ^ after;
+
+	/*
+	 * A change in MMIO or I/O decoding means every BAR decoding that
+	 * address space must be registered or unregistered.
+	 */
+	for (barno = 0; barno <= PCI_BARMAX; barno++) {
+		type = pi->pi_bar[barno].type;
+
+		if (type == PCIBAR_NONE || type == PCIBAR_MEMHI64)
+			continue;
+
+		if (type == PCIBAR_IO) {
+			/* I/O address space decoding changed? */
+			if ((delta & PCIM_CMD_PORTEN) == 0)
+				continue;
+			decoding = (porten(pi) != 0);
+		} else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) {
+			/* MMIO address space decoding changed? */
+			if ((delta & PCIM_CMD_MEMEN) == 0)
+				continue;
+			decoding = (memen(pi) != 0);
+		} else {
+			assert(0);
+			continue;
+		}
+
+		if (decoding)
+			register_bar(pi, barno);
+		else
+			unregister_bar(pi, barno);
+	}
+
+	/*
+	 * INTx may just have been unmasked while an interrupt is pending
+	 * (or masked while asserted); bring the line up to date.
+	 */
+	pci_lintr_update(pi);
+}
+/*
+ * Perform one PCI config-space access on behalf of the guest.
+ *
+ * 'bus', 'slot' and 'func' select the device, 'coff' is the byte offset
+ * into its config space and 'bytes' the access width.  'in' is non-zero
+ * for a read, in which case the result is stored through 'eax'; for a
+ * write '*eax' is the value being written.  'ctx' and 'vcpu' are only
+ * passed through to the device emulation's config callbacks.
+ *
+ * Reads that cannot be serviced return all ones (or zero for the first
+ * bytes past PCI_REGMAX, see below); writes that cannot be serviced are
+ * dropped.
+ */
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+ int coff, int bytes, uint32_t *eax)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int idx, needcfg;
+ uint64_t addr, mask;
+ uint64_t bar = 0; /* value stored back into the BAR register */
+
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
+ } else
+ pi = NULL;
+
+ /*
+ * Just return if there is no device at this slot:func or if the
+ * guest is doing an un-aligned access. Only 1-, 2- and 4-byte
+ * accesses aligned to their own size are accepted.
+ */
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *eax = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *eax = 0x00000000;
+ }
+ return;
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /*
+ * Let the device emulation override the default handler.
+ * A non-zero return from pe_cfgread means the value must
+ * still be taken from the stored config data.
+ */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
+ eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg)
+ *eax = CFGREAD(pi, coff, bytes);
+
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
+ } else {
+ /*
+ * Let the device emulation override the default handler.
+ * A zero return from pe_cfgwrite means the write has been
+ * fully handled.
+ */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return;
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return;
+ idx = (coff - PCIR_BAR(0)) / 4;
+ mask = ~(pi->pi_bar[idx].size - 1); /* assumes size is a power of two - TODO confirm */
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ pi->pi_bar[idx].addr = bar = 0;
+ break;
+ case PCIBAR_IO:
+ addr = *eax & mask;
+ addr &= 0xffff; /* I/O addresses are kept to 16 bits */
+ bar = addr | PCIM_BAR_IO_SPACE;
+ /*
+ * Register the new BAR value for interception
+ */
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_IO);
+ }
+ break;
+ case PCIBAR_MEM32:
+ addr = bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM32);
+ }
+ break;
+ case PCIBAR_MEM64: /* low 32 bits of a 64-bit BAR */
+ addr = bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ if (addr != (uint32_t)pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM64);
+ }
+ break;
+ case PCIBAR_MEMHI64: /* high 32 bits; size and address live in entry idx - 1 */
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ addr = ((uint64_t)*eax << 32) & mask;
+ bar = addr >> 32;
+ if (bar != pi->pi_bar[idx - 1].addr >> 32) {
+ update_bar_address(pi, addr, idx - 1,
+ PCIBAR_MEMHI64);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
+ pci_emul_cmdsts_write(pi, coff, *eax, bytes);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes); /* plain register: store as written */
+ }
+ }
+}
+
+/* Fields latched from the last 4-byte write to the config address port. */
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+/*
+ * I/O handler for the config address port.
+ *
+ * Only 4-byte accesses are honoured: a write latches the enable bit and
+ * the bus/slot/function/offset fields, a read returns them re-assembled.
+ * Narrower reads return all ones of the access width and narrower writes
+ * are dropped.  Always returns 0.
+ */
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	uint32_t reg;
+
+	if (bytes != 4) {
+		if (in)
+			*eax = (bytes == 2) ? 0xffff : 0xff;
+		return (0);
+	}
+
+	if (!in) {
+		/* Write: split the register into its fields. */
+		reg = *eax;
+		cfgenable = (reg & CONF1_ENABLE) == CONF1_ENABLE;
+		cfgoff = reg & PCI_REGMAX;
+		cfgfunc = (reg >> 8) & PCI_FUNCMAX;
+		cfgslot = (reg >> 11) & PCI_SLOTMAX;
+		cfgbus = (reg >> 16) & PCI_BUSMAX;
+		return (0);
+	}
+
+	/* Read: rebuild the register from the latched fields. */
+	reg = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+	if (cfgenable)
+		reg |= CONF1_ENABLE;
+	*eax = reg;
+
+	return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
+
+/*
+ * I/O handler for the four config data ports.
+ *
+ * With config access enabled through the address port, the access is
+ * forwarded to pci_cfgrw() at the latched offset plus the byte offset of
+ * 'port' from CONF1_DATA_PORT.  Otherwise reads return all ones and writes
+ * are ignored.  Always returns 0.
+ */
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	int regoff;
+
+	assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+	if (!cfgenable) {
+		/* Ignore accesses to cfgdata if not enabled by cfgaddr */
+		if (in)
+			*eax = 0xffffffff;
+		return (0);
+	}
+
+	regoff = cfgoff + (port - CONF1_DATA_PORT);
+	pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, regoff, bytes,
+	    eax);
+	return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ *
+ * The device exposes one I/O BAR backed by 'ioregs' and two memory BARs
+ * backed by 'memregs'; guest accesses are plain loads and stores into
+ * those arrays. PCI_EMUL_TEST is defined unconditionally just above, so
+ * the device is always compiled in.
+ */
+#define DIOSZ 8 /* size in bytes of the I/O BAR (BAR 0) */
+#define DMEMSZ 4096 /* size in bytes of each memory BAR (BARs 1 and 2) */
+struct pci_emul_dsoftc {
+ uint8_t ioregs[DIOSZ]; /* backing store for BAR 0 */
+ uint8_t memregs[2][DMEMSZ]; /* backing store for BAR 1 ([0]) and BAR 2 ([1]) */
+};
+
+#define PCI_EMUL_MSI_MSGS 4 /* message count passed to pci_emul_add_msicap() */
+#define PCI_EMUL_MSIX_MSGS 16 /* NOTE(review): no user visible in this part of the file */
+
+/*
+ * Instance-creation hook for the dummy test device.
+ *
+ * Allocates the zeroed soft state, fills in the identification registers,
+ * adds an MSI capability with PCI_EMUL_MSI_MSGS messages and allocates one
+ * I/O BAR of DIOSZ bytes plus two 32-bit memory BARs of DMEMSZ bytes.
+ * 'ctx' and 'opts' are unused.
+ *
+ * Returns 0 on success and -1 if the soft state cannot be allocated.
+ * NOTE(review): a non-zero return is assumed to make the pe_init caller
+ * abandon the device - confirm against the caller.
+ */
+static int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int error;
+	struct pci_emul_dsoftc *sc;
+
+	/*
+	 * The BAR handlers dereference pi_arg unconditionally, so do not
+	 * carry on with a NULL soft state.
+	 */
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL)
+		return (-1);
+
+	pi->pi_arg = sc;
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+	error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ);
+	assert(error == 0);
+
+	return (0);
+}
+
+/*
+ * BAR write handler for the dummy test device.
+ *
+ * BAR 0 stores 1-, 2- or 4-byte values into 'ioregs'; BARs 1 and 2 store
+ * 1-, 2-, 4- or 8-byte values into the matching 'memregs' array.  Accesses
+ * running past the end of the region are reported and dropped, and other
+ * sizes or BAR indexes are only reported.
+ *
+ * Two BAR 0 writes also raise interrupts: a 4-byte write at offset 4 with
+ * MSI enabled sends message (value % message count), and writing the value
+ * 0xabcdef sends every MSI message in turn.
+ */
+static void
+pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size, uint64_t value)
+{
+	int i, nmsgs;
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+	if (baridx == 0) {
+		if (offset + size > DIOSZ) {
+			/* 'offset' is unsigned 64-bit; print it as such. */
+			printf("diow: iow too large, offset %llu size %d\n",
+			    (unsigned long long)offset, size);
+			return;
+		}
+
+		if (size == 1) {
+			sc->ioregs[offset] = value & 0xff;
+		} else if (size == 2) {
+			*(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+		} else if (size == 4) {
+			*(uint32_t *)&sc->ioregs[offset] = value;
+		} else {
+			printf("diow: iow unknown size %d\n", size);
+		}
+
+		/*
+		 * Special magic value to generate an interrupt.  The
+		 * message count is checked first so that the modulo can
+		 * never divide by zero.
+		 */
+		if (offset == 4 && size == 4 && pci_msi_enabled(pi)) {
+			nmsgs = pci_msi_maxmsgnum(pi);
+			if (nmsgs > 0)
+				pci_generate_msi(pi, value % nmsgs);
+		}
+
+		if (value == 0xabcdef) {
+			for (i = 0; i < pci_msi_maxmsgnum(pi); i++)
+				pci_generate_msi(pi, i);
+		}
+	}
+
+	if (baridx == 1 || baridx == 2) {
+		if (offset + size > DMEMSZ) {
+			printf("diow: memw too large, offset %llu size %d\n",
+			    (unsigned long long)offset, size);
+			return;
+		}
+
+		i = baridx - 1;		/* 'memregs' index */
+
+		if (size == 1) {
+			sc->memregs[i][offset] = value;
+		} else if (size == 2) {
+			*(uint16_t *)&sc->memregs[i][offset] = value;
+		} else if (size == 4) {
+			*(uint32_t *)&sc->memregs[i][offset] = value;
+		} else if (size == 8) {
+			*(uint64_t *)&sc->memregs[i][offset] = value;
+		} else {
+			printf("diow: memw unknown size %d\n", size);
+		}
+
+		/*
+		 * magic interrupt ??
+		 */
+	}
+
+	if (baridx > 2 || baridx < 0) {
+		printf("diow: unknown bar idx %d\n", baridx);
+	}
+}
+
+/*
+ * BAR read handler for the dummy test device.
+ *
+ * BAR 0 returns 1-, 2- or 4-byte values from 'ioregs'; BARs 1 and 2 return
+ * 1-, 2-, 4- or 8-byte values from the matching 'memregs' array.  Reads
+ * running past the end of the region, and reads from any other BAR index,
+ * are reported and return 0.  An unsupported size is reported and also
+ * yields 0.
+ */
+static uint64_t
+pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+	uint64_t value;		/* 64 bits wide so 8-byte reads survive */
+	int i;
+
+	value = 0;
+	if (baridx == 0) {
+		if (offset + size > DIOSZ) {
+			/* 'offset' is unsigned 64-bit; print it as such. */
+			printf("dior: ior too large, offset %llu size %d\n",
+			    (unsigned long long)offset, size);
+			return (0);
+		}
+
+		if (size == 1) {
+			value = sc->ioregs[offset];
+		} else if (size == 2) {
+			value = *(uint16_t *) &sc->ioregs[offset];
+		} else if (size == 4) {
+			value = *(uint32_t *) &sc->ioregs[offset];
+		} else {
+			printf("dior: ior unknown size %d\n", size);
+		}
+	}
+
+	if (baridx == 1 || baridx == 2) {
+		if (offset + size > DMEMSZ) {
+			printf("dior: memr too large, offset %llu size %d\n",
+			    (unsigned long long)offset, size);
+			return (0);
+		}
+
+		i = baridx - 1;		/* 'memregs' index */
+
+		if (size == 1) {
+			value = sc->memregs[i][offset];
+		} else if (size == 2) {
+			value = *(uint16_t *) &sc->memregs[i][offset];
+		} else if (size == 4) {
+			value = *(uint32_t *) &sc->memregs[i][offset];
+		} else if (size == 8) {
+			value = *(uint64_t *) &sc->memregs[i][offset];
+		} else {
+			/* Memory path: label the message accordingly. */
+			printf("dior: memr unknown size %d\n", size);
+		}
+	}
+
+
+	if (baridx > 2 || baridx < 0) {
+		printf("dior: unknown bar idx %d\n", baridx);
+		return (0);
+	}
+
+	return (value);
+}
+/*
+ * Emulation descriptor for the dummy test device. Only instance creation
+ * and the BAR accessors are provided; the remaining callbacks are left
+ * unset.
+ */
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy", /* name of this device emulation */
+ .pe_init = pci_emul_dinit, /* sets up soft state, MSI capability and BARs */
+ .pe_barwrite = pci_emul_diow, /* guest writes to BARs 0-2 */
+ .pe_barread = pci_emul_dior /* guest reads from BARs 0-2 */
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h
new file mode 100644
index 0000000000..0053caed99
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_emul.h
@@ -0,0 +1,298 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+#include <sys/_pthreadtypes.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+
+struct vmctx;
+struct pci_devinst;
+struct memory_region;
+/*
+ * Operations vector describing one kind of emulated PCI device. Each
+ * device instance points at its emulation through pci_devinst.pi_d.
+ */
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *,
+ char *opts);
+
+ /*
+ * ACPI DSDT enumeration
+ * Optional; when set it is called for the device while its bus is
+ * written to the DSDT.
+ */
+ void (*pe_write_dsdt)(struct pci_devinst *);
+
+ /*
+ * config space read/write callbacks
+ * Both are optional. A pe_cfgwrite that returns 0 has fully handled
+ * the write. A pe_cfgread that returns non-zero asks for the value
+ * to be taken from the stored config data instead.
+ */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* BAR read/write callbacks */
+ void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value);
+ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size);
+
+#ifndef __FreeBSD__
+ void (*pe_lintrupdate)(struct pci_devinst *pi); /* optional; called after the legacy interrupt state is re-evaluated */
+#endif /* __FreeBSD__ */
+};
+#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); /* registers emulation 'x'; note the expansion already ends in ';' */
+
+enum pcibar_type {
+ PCIBAR_NONE, /* BAR not implemented */
+ PCIBAR_IO, /* I/O port BAR */
+ PCIBAR_MEM32, /* 32-bit memory BAR */
+ PCIBAR_MEM64, /* 64-bit memory BAR (register holding the low 32 bits) */
+ PCIBAR_MEMHI64 /* register holding the high 32 bits of the preceding PCIBAR_MEM64 */
+};
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size; /* region size in bytes; used to mask guest-written addresses */
+ uint64_t addr; /* base address currently programmed */
+};
+
+#define PI_NAMESZ 40 /* size of pci_devinst.pi_name */
+
+struct msix_table_entry {
+ uint64_t addr; /* message address */
+ uint32_t msg_data; /* message data */
+ uint32_t vector_control; /* PCIM_MSIX_VCTRL_MASK set here suppresses delivery */
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_ENTRIES 2048
+#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) /* 'msgnum' rounded up to a multiple of 64, divided by 8 */
+
+enum lintr_stat {
+ IDLE, /* interrupt not requested */
+ ASSERTED, /* interrupt requested and asserted */
+ PENDING /* interrupt requested while INTx delivery was not permitted */
+};
+/*
+ * State of one emulated PCI device instance (one bus/slot/function).
+ */
+struct pci_devinst {
+ struct pci_devemu *pi_d; /* emulation that implements this device */
+ struct vmctx *pi_vmctx; /* VM context, used when delivering MSI/MSI-X */
+ uint8_t pi_bus, pi_slot, pi_func; /* location of the device */
+ char pi_name[PI_NAMESZ];
+ int pi_bar_getsize;
+ int pi_prevcap;
+ int pi_capend;
+
+ struct {
+ int8_t pin; /* 0 = no legacy interrupt, else 1-based INTx pin */
+ enum lintr_stat state; /* protected by 'lock' */
+ int pirq_pin; /* PIRQ pin assigned when the interrupt is routed */
+ int ioapic_irq; /* I/O APIC IRQ assigned when the interrupt is routed */
+ pthread_mutex_t lock;
+ } pi_lintr;
+
+ struct {
+ int enabled;
+ uint64_t addr; /* message address */
+ uint64_t msg_data; /* base message data; the message index is added to it */
+ int maxmsgnum; /* message count reported while enabled */
+ } pi_msi;
+
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ uint32_t table_offset;
+ int table_count; /* number of entries in 'table' */
+ uint32_t pba_offset;
+ int pba_size;
+ int function_mask; /* non-zero suppresses all MSI-X delivery */
+ struct msix_table_entry *table; /* allocated at runtime */
+ void *pba_page;
+ int pba_page_offset;
+ } pi_msix;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1]; /* config space contents, see pci_{get,set}_cfgdata*() */
+ struct pcibar pi_bar[PCI_BARMAX + 1]; /* indexed by BAR number */
+};
+/*
+ * MSI capability layout. The structure is packed and its size is pinned
+ * at 14 bytes by the assertion below. Presumably this is the
+ * 64-bit-address form, given the addrlo/addrhi pair - TODO confirm.
+ */
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed");
+/*
+ * MSI-X capability layout. The structure is packed and its size is pinned
+ * at 12 bytes by the assertion below.
+ */
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_info; /* bar index and offset within it */
+ uint32_t pba_info; /* bar index and offset within it */
+} __packed;
+static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed");
+/*
+ * PCI Express capability layout. The structure is packed and its size is
+ * pinned at 60 bytes by the assertion below. The trailing comments name
+ * the kind of device each register group applies to.
+ */
+struct pciecap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t pcie_capabilities;
+
+ uint32_t dev_capabilities; /* all devices */
+ uint16_t dev_control;
+ uint16_t dev_status;
+
+ uint32_t link_capabilities; /* devices with links */
+ uint16_t link_control;
+ uint16_t link_status;
+
+ uint32_t slot_capabilities; /* ports with slots */
+ uint16_t slot_control;
+ uint16_t slot_status;
+
+ uint16_t root_control; /* root ports */
+ uint16_t root_capabilities;
+ uint32_t root_status;
+
+ uint32_t dev_capabilities2; /* all devices */
+ uint16_t dev_control2;
+ uint16_t dev_status2;
+
+ uint32_t link_capabilities2; /* devices with links */
+ uint16_t link_control2;
+ uint16_t link_status2;
+
+ uint32_t slot_capabilities2; /* ports with slots */
+ uint16_t slot_control2;
+ uint16_t slot_status2;
+} __packed;
+static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed");
+
+/*
+ * Callback type for pci_walk_lintr(): receives the bus, the slot, the
+ * 1-based interrupt pin, the PIRQ pin and I/O APIC IRQ routed to that pin,
+ * and the caller's opaque argument.
+ */
+typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
+    int ioapic_irq, void *arg);
+
+int	init_pci(struct vmctx *ctx);
+void	msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val);
+void	msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val);
+void	pci_callback(void);
+int	pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
+    enum pcibar_type type, uint64_t size);
+int	pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
+    uint64_t hostbase, enum pcibar_type type, uint64_t size);
+int	pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+int	pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
+/* Deliver MSI / MSI-X message 'msgnum'; a no-op while that mode is off. */
+void	pci_generate_msi(struct pci_devinst *pi, int msgnum);
+void	pci_generate_msix(struct pci_devinst *pi, int msgnum);
+/* Legacy INTx: request a pin at setup time, then assert/deassert it. */
+void	pci_lintr_assert(struct pci_devinst *pi);
+void	pci_lintr_deassert(struct pci_devinst *pi);
+void	pci_lintr_request(struct pci_devinst *pi);
+int	pci_msi_enabled(struct pci_devinst *pi);
+int	pci_msix_enabled(struct pci_devinst *pi);
+int	pci_msix_table_bar(struct pci_devinst *pi);
+int	pci_msix_pba_bar(struct pci_devinst *pi);
+int	pci_msi_maxmsgnum(struct pci_devinst *pi);
+int	pci_parse_slot(char *opt);
+/* Declared with (void) so that calls are checked against a prototype. */
+void	pci_print_supported_devices(void);
+void	pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+int	pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
+int	pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+    uint64_t value);
+uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
+/* Count, or visit, the in-use interrupt pins of every slot on 'bus'. */
+int	pci_count_lintr(int bus);
+void	pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
+void	pci_write_dsdt(void);
+uint64_t pci_ecfg_base(void);
+/* Non-zero when 'bus' has been set up. */
+int	pci_bus_configured(int bus);
+
+/*
+ * Store the byte 'val' at 'offset' in the device's config space data.
+ */
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+	uint8_t *reg;
+
+	assert(offset <= PCI_REGMAX);
+	reg = (uint8_t *)&pi->pi_cfgdata[offset];
+	*reg = val;
+}
+
+/*
+ * Store the 16-bit 'val' at the 2-byte aligned 'offset' in the device's
+ * config space data.
+ */
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+	uint16_t *reg;
+
+	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+	reg = (uint16_t *)&pi->pi_cfgdata[offset];
+	*reg = val;
+}
+
+/*
+ * Store the 32-bit 'val' at the 4-byte aligned 'offset' in the device's
+ * config space data.
+ */
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+	uint32_t *reg;
+
+	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+	reg = (uint32_t *)&pi->pi_cfgdata[offset];
+	*reg = val;
+}
+
+/*
+ * Fetch the byte at 'offset' from the device's config space data.
+ */
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+	uint8_t *reg;
+
+	assert(offset <= PCI_REGMAX);
+	reg = (uint8_t *)&pi->pi_cfgdata[offset];
+	return (*reg);
+}
+
+/*
+ * Fetch the 16-bit value at the 2-byte aligned 'offset' from the device's
+ * config space data.
+ */
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+	uint16_t *reg;
+
+	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+	reg = (uint16_t *)&pi->pi_cfgdata[offset];
+	return (*reg);
+}
+
+/*
+ * Fetch the 32-bit value at the 4-byte aligned 'offset' from the device's
+ * config space data.
+ */
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+	uint32_t *reg;
+
+	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+	reg = (uint32_t *)&pi->pi_cfgdata[offset];
+	return (*reg);
+}
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c
new file mode 100644
index 0000000000..8d24dde9da
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_fbuf.c
@@ -0,0 +1,467 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Nahanni Systems, Inc.
+ * Copyright 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "bhyvegc.h"
+#include "bhyverun.h"
+#include "console.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "rfb.h"
+#include "vga.h"
+
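+/*
+ * bhyve Framebuffer device emulation.
+ * BAR0 points to the current mode information.
+ * BAR1 is the 32-bit framebuffer address.
+ *
+ * -s <b>,fbuf,wait,vga=on|io|off,rfb=<ip>:port,w=width,h=height,password=<pw>
+ *
+ * "tcp=" is accepted as a synonym for "rfb=".  Builds without __FreeBSD__
+ * defined also accept "unix=<value>", which is handed to rfb_init_unix()
+ * in place of the host/port pair.
+ */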
+
+/*
+ * Debug output control.  DPRINTF() prints only when its level is less than
+ * or equal to fbuf_debug, so with the value 1 below DEBUG_INFO messages are
+ * shown and DEBUG_VERBOSE messages are suppressed.
+ */
+static int fbuf_debug = 1;
+#define	DEBUG_INFO	1
+#define	DEBUG_VERBOSE	4
+/*
+ * params is a complete, parenthesized printf() argument list, e.g.
+ * DPRINTF(DEBUG_INFO, ("x=%d\n", x)).  The body is wrapped in
+ * do { } while (0) so the macro behaves as one statement: a bare "if" would
+ * capture a following "else" when used as the body of an unbraced if/else.
+ */
+#define	DPRINTF(level, params) do {					\
+	if ((level) <= fbuf_debug)					\
+		printf params;						\
+} while (0)
+
+
+/* Byte-count multipliers. */
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+
+/*
+ * Size in bytes of the BAR 0 register window; also the bound applied to
+ * guest reads and writes of the mode-information registers.
+ */
+#define DMEMSZ 128
+
+/* Size in bytes of the frame buffer segment exposed through BAR 1. */
+#define FB_SIZE (16*MB)
+
+/* Largest width/height accepted by the "w=" and "h=" options. */
+#define COLS_MAX 1920
+#define ROWS_MAX 1200
+
+/* Resolution used when "w=" / "h=" are not given. */
+#define COLS_DEFAULT 1024
+#define ROWS_DEFAULT 768
+
+/* NOTE(review): not referenced by the code visible in this file section. */
+#define COLS_MIN 640
+#define ROWS_MIN 480
+
+/*
+ * Per-device state for the frame buffer emulation.  Allocated zeroed in
+ * pci_fbuf_init() and stored in pi->pi_arg.
+ */
+struct pci_fbuf_softc {
+ struct pci_devinst *fsc_pi;
+ /*
+  * Mode-information registers read and written by the guest through
+  * BAR 0.  The members add up to 128 bytes, matching DMEMSZ.
+  */
+ struct {
+ uint32_t fbsize;
+ uint16_t width;
+ uint16_t height;
+ uint16_t depth;
+ uint16_t refreshrate;
+ uint8_t reserved[116];
+ } __packed memregs;
+
+ /* rfb server */
+ /* These strings point into the option copy made by pci_fbuf_parse_opts(). */
+ char *rfb_host;
+ char *rfb_password;
+ int rfb_port;
+#ifndef __FreeBSD__
+ char *rfb_unix;
+#endif
+ int rfb_wait; /* set to 1 by the "wait" option */
+ int vga_enabled; /* vga=io or vga=on: vga_init() is called */
+ int vga_full; /* vga=on; currently rejected by pci_fbuf_init() */
+
+ uint32_t fbaddr; /* guest address assigned to BAR 1 */
+ char *fb_base; /* host mapping returned by vm_create_devmem() */
+ /* Dimensions last passed to bhyvegc_resize() by pci_fbuf_render(). */
+ uint16_t gc_width;
+ uint16_t gc_height;
+ void *vgasc; /* handle returned by vga_init() */
+ struct bhyvegc_image *gc_image; /* from console_get_image() */
+};
+
+/*
+ * The one frame buffer instance.  pci_fbuf_init() refuses to run again once
+ * this is non-NULL.
+ */
+static struct pci_fbuf_softc *fbuf_sc;
+
+/* Message count passed to pci_emul_add_msicap() by pci_fbuf_init(). */
+#define PCI_FBUF_MSI_MSGS 4
+
+/*
+ * Report a rejected fbuf option on stderr, followed by a one-line summary
+ * of the option syntax.
+ *
+ * opt: the option text that was rejected
+ */
+static void
+pci_fbuf_usage(char *opt)
+{
+	static const char syntax[] =
+	    "fbuf: {wait,}{vga=on|io|off,}rfb=<ip>:port"
+	    "{,w=width}{,h=height}\r\n";
+
+	fprintf(stderr, "Invalid fbuf emulation option \"%s\"\r\n", opt);
+	/* The summary has no conversions, so it is written as-is. */
+	fputs(syntax, stderr);
+}
+
+/*
+ * BAR write handler: store a guest write into the mode-information
+ * registers (sc->memregs), then update the VGA/VESA mode flag.
+ *
+ * ctx, vcpu: unused
+ * pi:        device instance; pi->pi_arg is the softc
+ * baridx:    must be 0 (asserted)
+ * offset:    byte offset into the DMEMSZ-byte register window
+ * size:      access width in bytes; 1, 2, 4 and 8 are handled
+ * value:     value to store, truncated to the access width
+ *
+ * Accesses that do not fit in the window are logged and dropped.
+ */
+static void
+pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_fbuf_softc *sc;
+	uint8_t *p;
+	uint16_t v16;
+	uint32_t v32;
+
+	assert(baridx == 0);
+
+	sc = pi->pi_arg;
+
+	/* uint64_t is printed through an explicit cast to match %llx. */
+	DPRINTF(DEBUG_VERBOSE,
+	    ("fbuf wr: offset 0x%llx, size: %d, value: 0x%llx\n",
+	    (unsigned long long)offset, size, (unsigned long long)value));
+
+	/*
+	 * Phrased without computing offset + size so the check cannot be
+	 * defeated by the sum wrapping.
+	 */
+	if (offset > DMEMSZ || (uint64_t)size > DMEMSZ - offset) {
+		printf("fbuf: write too large, offset %llu size %d\n",
+		    (unsigned long long)offset, size);
+		return;
+	}
+
+	p = (uint8_t *)&sc->memregs + offset;
+
+	/*
+	 * memregs is packed and the guest chooses the offset, so p may be
+	 * misaligned for the access width.  Narrow into a local and memcpy()
+	 * it rather than storing through a cast pointer.
+	 */
+	switch (size) {
+	case 1:
+		*p = (uint8_t)value;
+		break;
+	case 2:
+		v16 = (uint16_t)value;
+		memcpy(p, &v16, sizeof (v16));
+		break;
+	case 4:
+		v32 = (uint32_t)value;
+		memcpy(p, &v32, sizeof (v32));
+		break;
+	case 8:
+		memcpy(p, &value, sizeof (value));
+		break;
+	default:
+		printf("fbuf: write unknown size %d\n", size);
+		break;
+	}
+
+	/*
+	 * A zero width and height selects VGA mode; non-zero values for both
+	 * switch back to VESA mode.
+	 */
+	if (!sc->gc_image->vgamode && sc->memregs.width == 0 &&
+	    sc->memregs.height == 0) {
+		DPRINTF(DEBUG_INFO, ("switching to VGA mode\r\n"));
+		sc->gc_image->vgamode = 1;
+		/* Zeroed so the render callback resizes on the next switch. */
+		sc->gc_width = 0;
+		sc->gc_height = 0;
+	} else if (sc->gc_image->vgamode && sc->memregs.width != 0 &&
+	    sc->memregs.height != 0) {
+		DPRINTF(DEBUG_INFO, ("switching to VESA mode\r\n"));
+		sc->gc_image->vgamode = 0;
+	}
+}
+
+/*
+ * BAR read handler: return the contents of the mode-information registers
+ * (sc->memregs) at the requested offset.
+ *
+ * ctx, vcpu: unused
+ * pi:        device instance; pi->pi_arg is the softc
+ * baridx:    must be 0 (asserted)
+ * offset:    byte offset into the DMEMSZ-byte register window
+ * size:      access width in bytes; 1, 2, 4 and 8 are handled
+ *
+ * Returns the value read, zero-extended to 64 bits.  Accesses that do not
+ * fit in the window, or that use an unhandled width, are logged and read
+ * as 0.
+ */
+uint64_t
+pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size)
+{
+	struct pci_fbuf_softc *sc;
+	uint8_t *p;
+	uint16_t v16;
+	uint32_t v32;
+	uint64_t value;
+
+	assert(baridx == 0);
+
+	sc = pi->pi_arg;
+
+	/*
+	 * Phrased without computing offset + size so the check cannot be
+	 * defeated by the sum wrapping.
+	 */
+	if (offset > DMEMSZ || (uint64_t)size > DMEMSZ - offset) {
+		printf("fbuf: read too large, offset %llu size %d\n",
+		    (unsigned long long)offset, size);
+		return (0);
+	}
+
+	p = (uint8_t *)&sc->memregs + offset;
+	value = 0;
+
+	/*
+	 * memregs is packed and the guest chooses the offset, so p may be
+	 * misaligned for the access width.  memcpy() into a correctly typed
+	 * local rather than loading through a cast pointer.
+	 */
+	switch (size) {
+	case 1:
+		value = *p;
+		break;
+	case 2:
+		memcpy(&v16, p, sizeof (v16));
+		value = v16;
+		break;
+	case 4:
+		memcpy(&v32, p, sizeof (v32));
+		value = v32;
+		break;
+	case 8:
+		memcpy(&value, p, sizeof (value));
+		break;
+	default:
+		printf("fbuf: read unknown size %d\n", size);
+		break;
+	}
+
+	/* uint64_t is printed through an explicit cast to match %llx. */
+	DPRINTF(DEBUG_VERBOSE,
+	    ("fbuf rd: offset 0x%llx, size: %d, value: 0x%llx\n",
+	    (unsigned long long)offset, size, (unsigned long long)value));
+
+	return (value);
+}
+
+/*
+ * Parse the comma-separated fbuf option string into the softc.
+ *
+ * sc:   softc to fill in; defaults must already be set by the caller
+ * opts: option string, or NULL when none was given; it is not modified
+ *       (a private copy is tokenized)
+ *
+ * Recognized items: "wait", "rfb=" / "tcp=" ([host:]port or
+ * [ipv6-host]:port), "vga=on|io|off", "w=", "h=", "password=", and, when
+ * __FreeBSD__ is not defined, "unix=".
+ *
+ * Returns 0 on success and -1 on error.  On success the string fields of
+ * sc point into the private copy, which is therefore kept allocated; on
+ * error the copy is freed and those fields are reset to NULL.
+ */
+static int
+pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts)
+{
+	char *uopts, *xopts, *config, *lasts;
+	char *tmpstr;
+	int ret, dim;
+
+	/* No option string at all: keep the caller's defaults. */
+	if (opts == NULL)
+		return (0);
+
+	uopts = strdup(opts);
+	if (uopts == NULL) {
+		fprintf(stderr, "pci_fbuf: out of memory parsing options\n");
+		return (-1);
+	}
+
+	ret = 0;
+	lasts = NULL;
+	for (xopts = strtok_r(uopts, ",", &lasts);
+	    xopts != NULL;
+	    xopts = strtok_r(NULL, ",", &lasts)) {
+		if (strcmp(xopts, "wait") == 0) {
+			sc->rfb_wait = 1;
+			continue;
+		}
+
+		/* Everything else must be of the form name=value. */
+		if ((config = strchr(xopts, '=')) == NULL) {
+			pci_fbuf_usage(xopts);
+			ret = -1;
+			goto done;
+		}
+
+		*config++ = '\0';
+
+		DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s\r\n",
+		    xopts, config));
+
+		if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) {
+			/*
+			 * IPv4 -- host-ip:port
+			 * IPv6 -- [host-ip%zone]:port
+			 * XXX for now port is mandatory.
+			 */
+			tmpstr = strsep(&config, "]");
+			if (config) {
+				/* A ']' was found: bracketed host form. */
+				if (tmpstr[0] == '[')
+					tmpstr++;
+				sc->rfb_host = tmpstr;
+				if (config[0] == ':')
+					config++;
+				else {
+					pci_fbuf_usage(xopts);
+					ret = -1;
+					goto done;
+				}
+				sc->rfb_port = atoi(config);
+			} else {
+				/* No ']': either "port" or "host:port". */
+				config = tmpstr;
+				tmpstr = strsep(&config, ":");
+				if (!config)
+					sc->rfb_port = atoi(tmpstr);
+				else {
+					sc->rfb_port = atoi(config);
+					sc->rfb_host = tmpstr;
+				}
+			}
+#ifndef __FreeBSD__
+		} else if (!strcmp(xopts, "unix")) {
+			sc->rfb_unix = config;
+#endif
+		} else if (!strcmp(xopts, "vga")) {
+			if (!strcmp(config, "off")) {
+				sc->vga_enabled = 0;
+			} else if (!strcmp(config, "io")) {
+				sc->vga_enabled = 1;
+				sc->vga_full = 0;
+			} else if (!strcmp(config, "on")) {
+				sc->vga_enabled = 1;
+				sc->vga_full = 1;
+			} else {
+				pci_fbuf_usage(xopts);
+				ret = -1;
+				goto done;
+			}
+		} else if (!strcmp(xopts, "w")) {
+			/*
+			 * Range-check the int before narrowing it into the
+			 * 16-bit register, so that values such as 65537 or
+			 * negative numbers cannot wrap into the valid range.
+			 */
+			dim = atoi(config);
+			if (dim < 0 || dim > COLS_MAX) {
+				pci_fbuf_usage(xopts);
+				ret = -1;
+				goto done;
+			}
+			sc->memregs.width = (dim == 0) ? 1920 : dim;
+		} else if (!strcmp(xopts, "h")) {
+			/* Same narrowing concern as for "w" above. */
+			dim = atoi(config);
+			if (dim < 0 || dim > ROWS_MAX) {
+				pci_fbuf_usage(xopts);
+				ret = -1;
+				goto done;
+			}
+			sc->memregs.height = (dim == 0) ? 1080 : dim;
+		} else if (!strcmp(xopts, "password")) {
+			sc->rfb_password = config;
+		} else {
+			pci_fbuf_usage(xopts);
+			ret = -1;
+			goto done;
+		}
+	}
+
+done:
+	if (ret != 0) {
+		/*
+		 * The string fields may point into uopts; clear them before
+		 * releasing it so nothing dangles.
+		 */
+		sc->rfb_host = NULL;
+		sc->rfb_password = NULL;
+#ifndef __FreeBSD__
+		sc->rfb_unix = NULL;
+#endif
+		free(uopts);
+	}
+	return (ret);
+}
+
+
+extern void vga_render(struct bhyvegc *gc, void *arg);
+
+/*
+ * Console render callback (registered by pci_fbuf_init()).
+ *
+ * gc:  graphics console to draw into or resize
+ * arg: the struct pci_fbuf_softc that was registered with the callback
+ *
+ * In full-VGA mode the work is delegated to vga_render().  Otherwise the
+ * console is resized whenever the guest-programmed width/height differ from
+ * the dimensions applied last time.
+ */
+void
+pci_fbuf_render(struct bhyvegc *gc, void *arg)
+{
+	struct pci_fbuf_softc *sc = arg;
+
+	if (sc->vga_full && sc->gc_image->vgamode) {
+		/* TODO: mode switching to vga and vesa should use the special
+		 * EFI-bhyve protocol port.
+		 */
+		vga_render(gc, sc->vgasc);
+		return;
+	}
+
+	/* Nothing to do while the console already has the requested size. */
+	if (sc->gc_width == sc->memregs.width &&
+	    sc->gc_height == sc->memregs.height)
+		return;
+
+	bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height);
+	sc->gc_width = sc->memregs.width;
+	sc->gc_height = sc->memregs.height;
+}
+
+/*
+ * Device init entry point for the "fbuf" emulation.
+ *
+ * ctx:  VM context, used to create and map the frame buffer segment
+ * pi:   device instance being initialized; pi->pi_arg receives the softc
+ * opts: device option string (see pci_fbuf_parse_opts())
+ *
+ * Sets up config space, BAR 0 (mode registers), BAR 1 (frame buffer) and an
+ * MSI capability, parses the options, creates and maps the frame buffer,
+ * registers with the console and starts the RFB server.
+ *
+ * Returns 0 on success and non-zero on failure.  Only one instance may
+ * exist.  On failure the softc is freed and the references to it held in
+ * pi->pi_arg and fbuf_sc are cleared.
+ */
+static int
+pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int error, prot;
+	struct pci_fbuf_softc *sc;
+
+	if (fbuf_sc != NULL) {
+		fprintf(stderr, "Only one frame buffer device is allowed.\n");
+		return (-1);
+	}
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL) {
+		fprintf(stderr, "pci_fbuf: out of memory\n");
+		return (-1);
+	}
+
+	pi->pi_arg = sc;
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA);
+
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE);
+	assert(error == 0);
+
+	error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS);
+	assert(error == 0);
+
+	/* Defaults; pci_fbuf_parse_opts() may override them. */
+	sc->fbaddr = pi->pi_bar[1].addr;
+	sc->memregs.fbsize = FB_SIZE;
+	sc->memregs.width = COLS_DEFAULT;
+	sc->memregs.height = ROWS_DEFAULT;
+	sc->memregs.depth = 32;
+
+	sc->vga_enabled = 1;
+	sc->vga_full = 0;
+
+	sc->fsc_pi = pi;
+
+	error = pci_fbuf_parse_opts(sc, opts);
+	if (error != 0)
+		goto done;
+
+	/* XXX until VGA rendering is enabled */
+	if (sc->vga_full != 0) {
+		fprintf(stderr, "pci_fbuf: VGA rendering not enabled\n");
+		/*
+		 * error is still 0 from the successful parse; without this
+		 * the function would report success for a device that was
+		 * never set up.
+		 */
+		error = -1;
+		goto done;
+	}
+
+	sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE);
+	if (sc->fb_base == MAP_FAILED) {
+		error = -1;
+		goto done;
+	}
+	DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]\r\n",
+	    sc->fb_base, FB_SIZE));
+
+	/*
+	 * Map the framebuffer into the guest address space.
+	 * XXX This may fail if the BAR is different than a prior
+	 * run. In this case flag the error. This will be fixed
+	 * when a change_memseg api is available.
+	 */
+	prot = PROT_READ | PROT_WRITE;
+	if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) {
+		fprintf(stderr, "pci_fbuf: mapseg failed - try deleting VM and restarting\n");
+		error = -1;
+		goto done;
+	}
+
+	console_init(sc->memregs.width, sc->memregs.height, sc->fb_base);
+	console_fb_register(pci_fbuf_render, sc);
+
+	if (sc->vga_enabled)
+		sc->vgasc = vga_init(!sc->vga_full);
+	sc->gc_image = console_get_image();
+
+	fbuf_sc = sc;
+
+	memset((void *)sc->fb_base, 0, FB_SIZE);
+
+#ifdef __FreeBSD__
+	error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password);
+#else
+	if (sc->rfb_unix != NULL) {
+		error = rfb_init_unix(sc->rfb_unix, sc->rfb_wait,
+		    sc->rfb_password);
+	} else {
+		error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait,
+		    sc->rfb_password);
+	}
+#endif
+done:
+	if (error) {
+		/*
+		 * Do not leave pointers to the freed softc behind.
+		 * NOTE(review): if rfb_init*() failed, pci_fbuf_render is
+		 * still registered with the console with sc as its argument;
+		 * no unregister call is visible here -- confirm the caller
+		 * treats init failure as fatal.
+		 */
+		if (fbuf_sc == sc)
+			fbuf_sc = NULL;
+		pi->pi_arg = NULL;
+		free(sc);
+	}
+
+	return (error);
+}
+
+/*
+ * Emulation descriptor for the "fbuf" device: names the device and wires up
+ * its init and BAR access handlers.  PCI_EMUL_SET() is defined outside this
+ * file; presumably it adds the descriptor to the set of available
+ * emulations -- confirm in pci_emul.h.
+ */
+struct pci_devemu pci_fbuf = {
+ .pe_emu = "fbuf",
+ .pe_init = pci_fbuf_init,
+ .pe_barwrite = pci_fbuf_write,
+ .pe_barread = pci_fbuf_read
+};
+PCI_EMUL_SET(pci_fbuf);
diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000000..b926c7817e
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_hostbridge.c
@@ -0,0 +1,236 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#ifndef __FreeBSD__
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <strings.h>
+#endif
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+#ifdef __FreeBSD__
+/*
+ * Init entry point for the "hostbridge" emulation (FreeBSD build).
+ *
+ * Programs config space with the NetApp vendor/device IDs, a normal header
+ * type and the host-bridge class codes, then adds a PCIe root-port
+ * capability.  ctx and opts are unused.  Always returns 0.
+ */
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	/* NetApp's ID is used for both the vendor and the device field. */
+	const uint16_t netapp_id = 0x1275;
+
+	/* config space */
+	pci_set_cfgdata16(pi, PCIR_VENDOR, netapp_id);
+	pci_set_cfgdata16(pi, PCIR_DEVICE, netapp_id);
+	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+	pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
+
+	return (0);
+}
+
+/*
+ * Init entry point for the "amd_hostbridge" emulation (FreeBSD build).
+ *
+ * Performs the generic host bridge setup and then overwrites the
+ * vendor/device IDs with the AMD ones.  Always returns 0.
+ */
+static int
+pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	const uint16_t amd_vendor = 0x1022;	/* AMD */
+	const uint16_t amd_device = 0x7432;	/* made up */
+
+	(void) pci_hostbridge_init(ctx, pi, opts);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, amd_vendor);
+	pci_set_cfgdata16(pi, PCIR_DEVICE, amd_device);
+
+	return (0);
+}
+#else
+/*
+ * Program the config space of a host bridge and add its PCIe capability.
+ *
+ * pi:     device instance to configure
+ * vendor: value for the vendor ID register
+ * device: value for the device ID register
+ */
+static void
+pci_hostbridge_setup(struct pci_devinst *pi, uint16_t vendor, uint16_t device)
+{
+	/* Class information: a host bridge with a normal (type 0) header. */
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+
+	/* Identity as chosen by the caller. */
+	pci_set_cfgdata16(pi, PCIR_VENDOR, vendor);
+	pci_set_cfgdata16(pi, PCIR_DEVICE, device);
+
+	/* Added last, once the basic header fields are in place. */
+	pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
+}
+
+
+/*
+ * Parse a PCI vendor or device ID.
+ *
+ * in:  NUL-terminated number; base is auto-detected by strtol() (decimal,
+ *      0x-prefixed hex or 0-prefixed octal)
+ * val: receives the parsed value on success; untouched on failure
+ *
+ * Returns 0 on success.  Returns -1, after printing a diagnostic to stderr,
+ * if the text is empty, is not entirely a number, or is outside
+ * 1..UINT16_MAX.
+ */
+static int
+pci_hostbridge_parse_pci_val(const char *in, uint16_t *val)
+{
+	long num;
+	char *endp = NULL;
+
+	errno = 0;
+	num = strtol(in, &endp, 0);
+	/* endp == in means no digits were consumed (e.g. an empty string). */
+	if (errno != 0 || endp == NULL || endp == in || *endp != '\0') {
+		fprintf(stderr, "pci_hostbridge: invalid num '%s'\n", in);
+		return (-1);
+	} else if (num < 1 || num > UINT16_MAX) {
+		fprintf(stderr, "pci_hostbridge: 0x%04lx out of range\n",
+		    (unsigned long)num);
+		return (-1);
+	}
+	*val = (uint16_t)num;
+	return (0);
+}
+
+/*
+ * Host bridge models selectable with "model=<name>"; each entry maps a name
+ * to the vendor/device ID pair programmed into config space.
+ */
+static struct pci_hostbridge_model {
+ const char *phm_model;
+ uint16_t phm_vendor;
+ uint16_t phm_device;
+} pci_hb_models[] = {
+ { "amd", 0x1022, 0x7432 }, /* AMD/made-up */
+ { "netapp", 0x1275, 0x1275 }, /* NetApp/NetApp */
+ { "i440fx", 0x8086, 0x1237 }, /* Intel/82441 */
+ { "q35", 0x8086, 0x29b0 }, /* Intel/Q35 HB */
+};
+
+/* Number of entries in pci_hb_models[]. */
+#define NUM_HB_MODELS (sizeof (pci_hb_models) / sizeof (pci_hb_models[0]))
+
+/*
+ * Parse the host bridge option string.
+ *
+ * opts:    comma-separated "param=value" items; modified in place (the ','
+ *          and '=' separators are overwritten with NULs).  May be NULL or
+ *          empty.
+ * vendorp: receives the vendor ID on success
+ * devicep: receives the device ID on success
+ *
+ * Accepted parameters are "model" (a name from pci_hb_models[]) or the pair
+ * "vendor" and "device"; the two forms are mutually exclusive.  When neither
+ * form is given the "netapp" model is used.
+ *
+ * Every item is examined, so each bad item gets its own diagnostic on
+ * stderr.  Returns 0 on success and -1 on any error, in which case
+ * *vendorp and *devicep are untouched.
+ */
+static int
+pci_hostbridge_parse_args(char *opts, uint16_t *vendorp, uint16_t *devicep)
+{
+	const char *model = NULL;
+	char *next;
+	uint16_t vendor = 0, device = 0;
+	int err = 0;
+
+	for (; opts != NULL && *opts != '\0'; opts = next) {
+		char *val, *cp;
+
+		/* Cut the current item off the rest of the list. */
+		if ((cp = strchr(opts, ',')) != NULL) {
+			*cp = '\0';
+			next = cp + 1;
+		} else {
+			next = NULL;
+		}
+
+		if ((cp = strchr(opts, '=')) == NULL) {
+			fprintf(stderr,
+			    "pci_hostbridge: expected value for param"
+			    " (%s=VAL)\n", opts);
+			err = -1;
+			continue;
+		}
+
+		/* <param>=<value> handling */
+		val = cp + 1;
+		*cp = '\0';
+		if (strcmp(opts, "model") == 0) {
+			model = val;
+		} else if (strcmp(opts, "vendor") == 0) {
+			if (pci_hostbridge_parse_pci_val(val, &vendor) != 0)
+				err = -1;
+		} else if (strcmp(opts, "device") == 0) {
+			if (pci_hostbridge_parse_pci_val(val, &device) != 0)
+				err = -1;
+		} else {
+			fprintf(stderr,
+			    "pci_hostbridge: unrecognized option '%s'\n", opts);
+			err = -1;
+		}
+	}
+	if (err != 0) {
+		return (err);
+	}
+
+	/* 0 is never a parsed ID, so it doubles as "not specified". */
+	if (model != NULL && (vendor != 0 || device != 0)) {
+		fprintf(stderr, "pci_hostbridge: cannot specify model "
+		    "and vendor/device\n");
+		return (-1);
+	} else if ((vendor != 0 && device == 0) ||
+	    (vendor == 0 && device != 0)) {
+		fprintf(stderr, "pci_hostbridge: must specify both vendor and "
+		    "device for custom hostbridge\n");
+		return (-1);
+	}
+
+	/*
+	 * Nothing was specified (e.g. an empty option string).  Use the
+	 * NetApp model, the same default pci_hostbridge_init() applies when
+	 * there is no option string, rather than handing back IDs of 0.
+	 */
+	if (model == NULL && vendor == 0 && device == 0)
+		model = "netapp";
+
+	if (model != NULL) {
+		uint_t i;
+
+		for (i = 0; i < NUM_HB_MODELS; i++) {
+			if (strcmp(model, pci_hb_models[i].phm_model) != 0)
+				continue;
+
+			/* found a model match */
+			*vendorp = pci_hb_models[i].phm_vendor;
+			*devicep = pci_hb_models[i].phm_device;
+			return (0);
+		}
+		fprintf(stderr, "pci_hostbridge: invalid model '%s'\n", model);
+		return (-1);
+	}
+
+	/* custom hostbridge ID was specified */
+	*vendorp = vendor;
+	*devicep = device;
+	return (0);
+}
+
+/*
+ * Init entry point for the "hostbridge" emulation (non-FreeBSD build).
+ *
+ * ctx:  unused
+ * pi:   device instance to configure
+ * opts: option string for pci_hostbridge_parse_args(), or NULL
+ *
+ * Returns 0 on success, or -1 if the options were rejected.
+ */
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	/* Fall back to NetApp default if no options are specified */
+	uint16_t vendor = 0x1275;
+	uint16_t device = 0x1275;
+
+	/* A successful parse always overwrites both IDs. */
+	if (opts != NULL &&
+	    pci_hostbridge_parse_args(opts, &vendor, &device) != 0)
+		return (-1);
+
+	pci_hostbridge_setup(pi, vendor, device);
+	return (0);
+}
+
+/*
+ * Init entry point for the "amd_hostbridge" emulation (non-FreeBSD build).
+ *
+ * Configures the bridge with the fixed AMD IDs; ctx and opts are unused.
+ * Always returns 0.
+ */
+static int
+pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	const uint16_t amd_vendor = 0x1022;
+	const uint16_t amd_device = 0x7432;
+
+	pci_hostbridge_setup(pi, amd_vendor, amd_device);
+	return (0);
+}
+
+#endif /* __FreeBSD__ */
+
+/*
+ * Emulation descriptors for the two host bridge device names.  Each one
+ * supplies only an init handler.  PCI_EMUL_SET() is defined outside this
+ * file; presumably it adds the descriptor to the set of available
+ * emulations -- confirm in pci_emul.h.
+ */
+struct pci_devemu pci_de_amd_hostbridge = {
+ .pe_emu = "amd_hostbridge",
+ .pe_init = pci_amd_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_amd_hostbridge);
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c
new file mode 100644
index 0000000000..4ecb3eddb0
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_irq.c
@@ -0,0 +1,354 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+/*
+ * Implement an 8 pin PCI interrupt router compatible with the router
+ * present on Intel's ICH10 chip.
+ */
+
+/*
+ * Fields in each PIRQ register: bit 7 disables the pin, the low nibble
+ * selects the IRQ the pin is routed to.
+ */
+#define PIRQ_DIS 0x80
+#define PIRQ_IRQ 0x0f
+
+/*
+ * Only IRQs 3-7, 9-12, and 14-15 are permitted.  PERMITTED_IRQS is a bitmap
+ * with bit N set when IRQ N may be used.
+ */
+#define PERMITTED_IRQS 0xdef8
+#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0)
+
+/* Value stored in irq_counts[] to mark an IRQ as unavailable for routing. */
+#define IRQ_DISABLED 0xff
+
+/* State of the eight PIRQ pins; pin N (1-based) is pirqs[N - 1]. */
+static struct pirq {
+ uint8_t reg; /* current register value (PIRQ_DIS | PIRQ_IRQ bits) */
+ int use_count; /* devices assigned to this pin by pirq_alloc_pin() */
+ int active_count; /* asserts not yet matched by a deassert */
+ pthread_mutex_t lock; /* taken by pirq_write() and the assert/deassert paths */
+} pirqs[8];
+
+/* Per-IRQ usage count, or IRQ_DISABLED; indexed by IRQ number. */
+static u_char irq_counts[16];
+/*
+ * Non-zero until the first pirq_alloc_pin() call; pci_irq_reserve() and
+ * pci_irq_use() assert that it is still set.
+ */
+static int pirq_cold = 1;
+
+/*
+ * Report whether a PIRQ register value routes to a usable IRQ: the disable
+ * bit must be clear and the IRQ field must name a permitted IRQ.  A value
+ * that selects a reserved IRQ is treated exactly like a disabled pin, so
+ * no interrupt is asserted for it.
+ */
+static bool
+pirq_valid_irq(int reg)
+{
+	return ((reg & PIRQ_DIS) == 0 && IRQ_PERMITTED(reg & PIRQ_IRQ));
+}
+
+/*
+ * Return the current register value of a PIRQ pin.
+ *
+ * pin: 1-based pin number, asserted to be within 1..nitems(pirqs)
+ */
+uint8_t
+pirq_read(int pin)
+{
+	const struct pirq *pirq;
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	pirq = &pirqs[pin - 1];
+	return (pirq->reg);
+}
+
+/*
+ * Update the register of a PIRQ pin.
+ *
+ * ctx: VM context used for the ISA IRQ calls
+ * pin: 1-based pin number, asserted to be within 1..nitems(pirqs)
+ * val: new register value; only the PIRQ_DIS and PIRQ_IRQ bits are kept
+ *
+ * If the value changes while the pin has outstanding asserts, the interrupt
+ * is deasserted on the old IRQ and asserted on the new one, in each case
+ * only when that routing is valid.
+ */
+void
+pirq_write(struct vmctx *ctx, int pin, uint8_t val)
+{
+	struct pirq *pirq;
+	uint8_t newreg;
+	bool active;
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	pirq = &pirqs[pin - 1];
+	newreg = val & (PIRQ_DIS | PIRQ_IRQ);
+
+	pthread_mutex_lock(&pirq->lock);
+	if (pirq->reg != newreg) {
+		/* The lock is held, so active_count is stable for both tests. */
+		active = (pirq->active_count != 0);
+
+		if (active && pirq_valid_irq(pirq->reg))
+			vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+		pirq->reg = newreg;
+		if (active && pirq_valid_irq(pirq->reg))
+			vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+	}
+	pthread_mutex_unlock(&pirq->lock);
+}
+
+/*
+ * Take an IRQ out of the pool that PIRQ pins may be routed to.
+ *
+ * irq: IRQ number, asserted to be a valid irq_counts[] index
+ *
+ * Only allowed before the first pirq_alloc_pin() call, and only for an IRQ
+ * that has no users yet (or is already disabled); both are asserted.
+ */
+void
+pci_irq_reserve(int irq)
+{
+	assert(irq >= 0 && irq < nitems(irq_counts));
+	assert(pirq_cold);
+	assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
+
+	irq_counts[irq] = IRQ_DISABLED;
+}
+
+/*
+ * Record one more user of an IRQ, so that pirq_alloc_pin() sees it as more
+ * heavily loaded.
+ *
+ * irq: IRQ number, asserted to be a valid irq_counts[] index
+ *
+ * Only allowed before the first pirq_alloc_pin() call and for an IRQ that
+ * is not disabled; both are asserted.
+ */
+void
+pci_irq_use(int irq)
+{
+	assert(irq >= 0 && irq < nitems(irq_counts));
+	assert(pirq_cold);
+	assert(irq_counts[irq] != IRQ_DISABLED);
+
+	irq_counts[irq] += 1;
+}
+
+/*
+ * Reset the interrupt router state: every PIRQ pin is disabled with zeroed
+ * counters and a freshly initialized mutex, and every IRQ gets a count of 0
+ * if it is permitted or IRQ_DISABLED if it is not.
+ *
+ * ctx: unused
+ */
+void
+pci_irq_init(struct vmctx *ctx)
+{
+	struct pirq *pirq;
+	int irq;
+
+	for (pirq = pirqs; pirq < &pirqs[nitems(pirqs)]; pirq++) {
+		pirq->reg = PIRQ_DIS;
+		pirq->use_count = 0;
+		pirq->active_count = 0;
+		pthread_mutex_init(&pirq->lock, NULL);
+	}
+
+	for (irq = 0; irq < nitems(irq_counts); irq++)
+		irq_counts[irq] = IRQ_PERMITTED(irq) ? 0 : IRQ_DISABLED;
+}
+
+/*
+ * Assert a device's legacy interrupt.
+ *
+ * pi: device instance; pi_lintr supplies the PIRQ pin and I/O APIC IRQ
+ *
+ * When the device has a PIRQ pin, the pin's active count is bumped under
+ * its lock.  On the 0 -> 1 transition with a valid routing the interrupt is
+ * raised through vm_isa_assert_irq() and nothing else is done.  In every
+ * other case the I/O APIC pin is asserted directly.
+ */
+void
+pci_irq_assert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+	bool via_isa;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count += 1;
+		via_isa = (pirq->active_count == 1 &&
+		    pirq_valid_irq(pirq->reg));
+		if (via_isa) {
+			vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+		}
+		pthread_mutex_unlock(&pirq->lock);
+
+		if (via_isa)
+			return;
+	}
+	vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+/*
+ * Deassert a device's legacy interrupt; the mirror image of
+ * pci_irq_assert().
+ *
+ * pi: device instance; pi_lintr supplies the PIRQ pin and I/O APIC IRQ
+ *
+ * When the device has a PIRQ pin, the pin's active count is dropped under
+ * its lock.  On the 1 -> 0 transition with a valid routing the interrupt is
+ * lowered through vm_isa_deassert_irq() and nothing else is done.  In every
+ * other case the I/O APIC pin is deasserted directly.
+ */
+void
+pci_irq_deassert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+	bool via_isa;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count -= 1;
+		via_isa = (pirq->active_count == 0 &&
+		    pirq_valid_irq(pirq->reg));
+		if (via_isa) {
+			vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+		}
+		pthread_mutex_unlock(&pirq->lock);
+
+		if (via_isa)
+			return;
+	}
+	vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+/*
+ * Choose a PIRQ pin for a device and, if that pin is not routed yet, route
+ * it to an IRQ.
+ *
+ * pi: device instance; its slot and interrupt pin are used for the fixed
+ *     mapping, and its VM context for setting the trigger mode
+ *
+ * Returns the 1-based number of the chosen pin.  The first call ends the
+ * "cold" phase in which pci_irq_reserve() and pci_irq_use() may be called.
+ */
+int
+pirq_alloc_pin(struct pci_devinst *pi)
+{
+	struct vmctx *ctx = pi->pi_vmctx;
+	struct pirq *chosen;
+	int best_irq, best_pin, irq, pin;
+
+	pirq_cold = 0;
+
+	if (lpc_bootrom()) {
+		/* For external bootrom use fixed mapping. */
+		best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8;
+	} else {
+		/* Least-used pin; the lowest-numbered one wins a tie. */
+		best_pin = 0;
+		for (pin = 1; pin < nitems(pirqs); pin++) {
+			if (pirqs[pin].use_count < pirqs[best_pin].use_count)
+				best_pin = pin;
+		}
+	}
+	chosen = &pirqs[best_pin];
+	chosen->use_count++;
+
+	/* A pin that already has a routing keeps it. */
+	if (chosen->reg != PIRQ_DIS)
+		return (best_pin + 1);
+
+	/* Route to the least-used enabled IRQ; lowest number wins a tie. */
+	best_irq = -1;
+	for (irq = 0; irq < nitems(irq_counts); irq++) {
+		if (irq_counts[irq] == IRQ_DISABLED)
+			continue;
+		if (best_irq == -1 || irq_counts[irq] < irq_counts[best_irq])
+			best_irq = irq;
+	}
+	assert(best_irq >= 0);
+
+	irq_counts[best_irq]++;
+	chosen->reg = best_irq;
+	vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
+
+	return (best_pin + 1);
+}
+
+/*
+ * Return the IRQ number held in the IRQ field of a PIRQ pin's register.
+ * The disable bit is not consulted.
+ *
+ * pin: 1-based pin number, asserted to be within 1..nitems(pirqs)
+ */
+int
+pirq_irq(int pin)
+{
+	const struct pirq *pirq;
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	pirq = &pirqs[pin - 1];
+	return (pirq->reg & PIRQ_IRQ);
+}
+
+/* XXX: Generate $PIR table. */
+
+/*
+ * Emit the ASL for the PCI interrupt link devices through dsdt_line(): a
+ * PIRV helper method followed by one LNKx device per PIRQ pin.
+ */
+static void
+pirq_dsdt(void)
+{
+ char *irq_prs, *old;
+ int irq, pin;
+
+ /*
+  * Build a comma-separated list of the permitted IRQ numbers; it is
+  * used as the IRQ list of every link's _PRS.
+  * NOTE(review): the asprintf() results are not checked, so irq_prs
+  * could still be NULL when it is formatted with %s below.
+  */
+ irq_prs = NULL;
+ for (irq = 0; irq < nitems(irq_counts); irq++) {
+ if (!IRQ_PERMITTED(irq))
+ continue;
+ if (irq_prs == NULL)
+ asprintf(&irq_prs, "%d", irq);
+ else {
+ old = irq_prs;
+ asprintf(&irq_prs, "%s,%d", old, irq);
+ free(old);
+ }
+ }
+
+ /*
+ * A helper method to validate a link register's value. This
+ * duplicates pirq_valid_irq().
+ */
+ /*
+  * The literal tests below (less than 3, equal to 8, equal to 13) are
+  * the IRQs in 0-15 that PERMITTED_IRQS excludes.
+  */
+ dsdt_line("");
+ dsdt_line("Method (PIRV, 1, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS);
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
+ dsdt_line(" If (LLess (Local0, 0x03))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" If (LEqual (Local0, 0x08))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" If (LEqual (Local0, 0x0D))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" Return (0x01)");
+ dsdt_line("}");
+
+ /* One link device per pin, named LNKA onward, with _UID pin + 1. */
+ for (pin = 0; pin < nitems(pirqs); pin++) {
+ dsdt_line("");
+ dsdt_line("Device (LNK%c)", 'A' + pin);
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))");
+ dsdt_line(" Name (_UID, 0x%02X)", pin + 1);
+ /* _STA: value depends on whether the PIRx register passes PIRV. */
+ dsdt_line(" Method (_STA, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" If (PIRV (PIR%c))", 'A' + pin);
+ dsdt_line(" {");
+ dsdt_line(" Return (0x0B)");
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x09)");
+ dsdt_line(" }");
+ dsdt_line(" }");
+ /* _PRS: the possible settings, i.e. the permitted IRQ list. */
+ dsdt_line(" Name (_PRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
+ dsdt_line(" {%s}", irq_prs);
+ dsdt_line(" })");
+ /* CBxx: template buffer that _CRS fills in and returns. */
+ dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1);
+ dsdt_line(" {");
+ dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
+ dsdt_line(" {}");
+ dsdt_line(" })");
+ dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)",
+ pin + 1, 'A' + pin);
+ dsdt_line(" Method (_CRS, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin,
+ PIRQ_DIS | PIRQ_IRQ);
+ dsdt_line(" If (PIRV (Local0))");
+ dsdt_line(" {");
+ dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Store (0x00, CIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line(" Return (CB%02X)", pin + 1);
+ dsdt_line(" }");
+ /* _DIS: store 0x80 (the PIRQ_DIS bit) into the register. */
+ dsdt_line(" Method (_DIS, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" Store (0x80, PIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ /* _SRS: convert the IRQ bitmask in Arg0 to a number and store it. */
+ dsdt_line(" Method (_SRS, 1, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
+ dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin);
+ dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line("}");
+ }
+ free(irq_prs);
+}
+/*
+ * LPC_DSDT() is defined outside this file; presumably it registers
+ * pirq_dsdt() with the LPC DSDT generator -- confirm in pci_lpc.h.
+ */
+LPC_DSDT(pirq_dsdt);
diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h
new file mode 100644
index 0000000000..1ae56efc8f
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_irq.h
@@ -0,0 +1,47 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __PCI_IRQ_H__
+#define __PCI_IRQ_H__
+
+/* For uint8_t, so the header does not depend on its includer. */
+#include <stdint.h>
+
+/*
+ * Only pointers to these are used below.  Declaring both tags at file scope
+ * keeps "struct vmctx" from being scoped to a single prototype when the
+ * includer has not declared it yet.
+ */
+struct pci_devinst;
+struct vmctx;
+
+/* Assert / deassert a device's legacy (INTx) interrupt. */
+void pci_irq_assert(struct pci_devinst *pi);
+void pci_irq_deassert(struct pci_devinst *pi);
+
+/* Reset all PIRQ pins to disabled and reset the per-IRQ usage counts. */
+void pci_irq_init(struct vmctx *ctx);
+
+/*
+ * Mark an IRQ as unavailable for PIRQ routing / count one more user of an
+ * IRQ.  Both must be called before the first pirq_alloc_pin().
+ */
+void pci_irq_reserve(int irq);
+void pci_irq_use(int irq);
+
+/* Pick a PIRQ pin for the device; returns the 1-based pin number. */
+int pirq_alloc_pin(struct pci_devinst *pi);
+
+/* IRQ number currently held in the register of 1-based pin 'pin'. */
+int pirq_irq(int pin);
+
+/* Read / write the register of 1-based pin 'pin'. */
+uint8_t pirq_read(int pin);
+void pirq_write(struct vmctx *ctx, int pin, uint8_t val);
+
+#endif
diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c
new file mode 100644
index 0000000000..b7ddb772a1
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_lpc.c
@@ -0,0 +1,481 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <machine/vmm.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "bootrom.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "uart_emul.h"
+
+/* Base I/O ports of the two controllers listed under Device (PIC) in the DSDT. */
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+
+/* Linker sets filled in by the LPC_DSDT() and LPC_SYSRES() macros. */
+SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
+SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
+
+/* Two I/O ports starting at ELCR_PORT are registered as a system resource. */
+#define ELCR_PORT 0x4d0
+SYSRES_IO(ELCR_PORT, 2);
+
+/* First of the four I/O ports listed under Device (TIMR) in the DSDT. */
+#define IO_TIMER1_PORT 0x40
+
+/* One I/O port at NMISC_PORT is registered as a system resource. */
+#define NMISC_PORT 0x61
+SYSRES_IO(NMISC_PORT, 1);
+
+/* The single LPC bridge instance; assigned at the end of pci_lpc_init(). */
+static struct pci_devinst *lpc_bridge;
+
+/* Option string that followed "bootrom," in lpc_device_parse(), or NULL. */
+static const char *romfile;
+
+/* State kept for each of the LPC_UART_NUM COM ports behind the bridge. */
+#define LPC_UART_NUM 2
+static struct lpc_uart_softc {
+ struct uart_softc *uart_softc; /* handle returned by uart_init() */
+ const char *opts; /* backend options from lpc_device_parse() */
+ int iobase; /* I/O base filled in by uart_legacy_alloc() */
+ int irq; /* IRQ filled in by uart_legacy_alloc() */
+ int enabled; /* set to 1 once lpc_init() registered the port */
+} lpc_uart_softc[LPC_UART_NUM];
+
+/* Device names accepted on the command line, indexed by unit. */
+static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
+
+/*
+ * Parse one LPC device configuration string of the form
+ *	<lpc_device_name>[,<options>]
+ * for example "com1,stdio" or "bootrom,/var/romfile".
+ *
+ * The device name is matched case-insensitively against "bootrom" and the
+ * UART names. On a match the text after the first comma (NULL when there is
+ * no comma) is remembered for that device and 0 is returned; the duplicated
+ * string is intentionally kept alive because the stored pointer refers into
+ * it. On any failure the copy is released and -1 is returned.
+ */
+int
+lpc_device_parse(const char *opts)
+{
+	char *buf, *rest, *devname;
+	int idx;
+
+	buf = strdup(opts);
+	rest = buf;
+	/* strsep() yields NULL when strdup() failed and rest is NULL. */
+	devname = strsep(&rest, ",");
+	if (devname == NULL) {
+		free(buf);
+		return (-1);
+	}
+
+	if (strcasecmp(devname, "bootrom") == 0) {
+		romfile = rest;
+		return (0);
+	}
+
+	idx = 0;
+	while (idx < LPC_UART_NUM) {
+		if (strcasecmp(devname, lpc_uart_names[idx]) == 0) {
+			lpc_uart_softc[idx].opts = rest;
+			return (0);
+		}
+		idx++;
+	}
+
+	/* Unknown device name: nothing references the copy. */
+	free(buf);
+	return (-1);
+}
+
+/*
+ * Print every device name understood by lpc_device_parse() to stdout,
+ * one per line: "bootrom" first, then each UART name.
+ */
+void
+lpc_print_supported_devices()
+{
+	const char **namep;
+
+	printf("bootrom\n");
+	namep = lpc_uart_names;
+	while (namep < &lpc_uart_names[LPC_UART_NUM]) {
+		printf("%s\n", *namep);
+		namep++;
+	}
+}
+
+/*
+ * Return the boot ROM option string recorded by lpc_device_parse(), or NULL
+ * if no "bootrom" device was configured.
+ */
+const char *
+lpc_bootrom(void)
+{
+
+ return (romfile);
+}
+
+/*
+ * UART interrupt-assert callback. 'arg' is the lpc_uart_softc that was
+ * handed to uart_init(); its IRQ is pulsed through the LPC bridge's VM
+ * context, with the same number passed for both IRQ arguments.
+ */
+static void
+lpc_uart_intr_assert(void *arg)
+{
+	struct lpc_uart_softc *uart = arg;
+	int irq = uart->irq;
+
+	assert(irq >= 0);
+
+	vm_isa_pulse_irq(lpc_bridge->pi_vmctx, irq, irq);
+}
+
+/*
+ * UART interrupt-deassert callback passed to uart_init(); intentionally a
+ * no-op and 'arg' is unused.
+ */
+static void
+lpc_uart_intr_deassert(void *arg)
+{
+ /*
+ * The COM devices on the LPC bus generate edge triggered interrupts,
+ * so nothing more to do here.
+ */
+}
+
+/*
+ * I/O port handler for an LPC UART.
+ *
+ * 'arg' is the lpc_uart_softc registered with the port range. 'port' is
+ * translated to a register offset relative to the UART's I/O base, and an
+ * access of 'bytes' bytes is split into single-register accesses in
+ * ascending offset order, least significant byte first. 'in' selects a
+ * read (result stored in *eax) or a write (value taken from *eax).
+ *
+ * Returns 0 on success and -1 for an unsupported access width. 4-byte
+ * accesses are only accepted in non-FreeBSD builds.
+ */
+static int
+lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	struct lpc_uart_softc *sc = arg;
+	uint32_t val;
+	int offset, i;
+
+	switch (bytes) {
+	case 1:
+	case 2:
+#ifndef __FreeBSD__
+	case 4:
+#endif
+		break;
+	default:
+		return (-1);
+	}
+
+	offset = port - sc->iobase;
+
+	if (in) {
+		val = 0;
+		for (i = 0; i < bytes; i++) {
+			/*
+			 * Widen to uint32_t before shifting so that a byte
+			 * with its top bit set cannot overflow a promoted
+			 * int when shifted by 24.
+			 */
+			val |= (uint32_t)uart_read(sc->uart_softc,
+			    offset + i) << (8 * i);
+		}
+		*eax = val;
+	} else {
+		for (i = 0; i < bytes; i++) {
+			uart_write(sc->uart_softc, offset + i,
+			    *eax >> (8 * i));
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Bring up the devices hosted behind the LPC bridge.
+ *
+ * If a boot ROM was configured it is initialised first; a failure there is
+ * returned unchanged. Then, for every UART unit: legacy I/O base and IRQ are
+ * allocated, the IRQ is reserved, the UART is created and bound to its
+ * backend options, and its I/O port range is registered with
+ * lpc_uart_io_handler(). Returns 0 on success and -1 (after a message on
+ * stderr) if resources or the backend could not be set up.
+ */
+static int
+lpc_init(struct vmctx *ctx)
+{
+	struct lpc_uart_softc *uart;
+	struct inout_port iop;
+	const char *devname;
+	int idx, rc;
+
+	if (romfile != NULL) {
+		rc = bootrom_init(ctx, romfile);
+		if (rc != 0)
+			return (rc);
+	}
+
+	/* COM1 and COM2 */
+	idx = 0;
+	while (idx < LPC_UART_NUM) {
+		uart = &lpc_uart_softc[idx];
+		devname = lpc_uart_names[idx];
+
+		if (uart_legacy_alloc(idx, &uart->iobase, &uart->irq) != 0) {
+			fprintf(stderr, "Unable to allocate resources for "
+			    "LPC device %s\n", devname);
+			return (-1);
+		}
+		pci_irq_reserve(uart->irq);
+
+		uart->uart_softc = uart_init(lpc_uart_intr_assert,
+		    lpc_uart_intr_deassert, uart);
+
+		if (uart_set_backend(uart->uart_softc, uart->opts) != 0) {
+			fprintf(stderr, "Unable to initialize backend '%s' "
+			    "for LPC device %s\n", uart->opts, devname);
+			return (-1);
+		}
+
+		/* Describe the port range and hand it to the I/O dispatcher. */
+		bzero(&iop, sizeof(struct inout_port));
+		iop.name = devname;
+		iop.port = uart->iobase;
+		iop.size = UART_IO_BAR_SIZE;
+		iop.flags = IOPORT_F_INOUT;
+		iop.handler = lpc_uart_io_handler;
+		iop.arg = uart;
+
+		rc = register_inout(&iop);
+		assert(rc == 0);
+		uart->enabled = 1;
+
+		idx++;
+	}
+
+	return (0);
+}
+
+/*
+ * Emit the DSDT description of the LPC/ISA bridge 'pi'.
+ *
+ * The output is, in order: the Device (ISA) header with its _ADR built from
+ * the slot and function numbers and the PIRA..PIRH fields at config offsets
+ * 0x60 and 0x68; whatever each handler registered in lpc_dsdt_set emits; and
+ * finally the PIC and TIMR child devices.
+ */
+static void
+pci_lpc_write_dsdt(struct pci_devinst *pi)
+{
+ struct lpc_dsdt **ldpp, *ldp;
+
+ dsdt_line("");
+ dsdt_line("Device (ISA)");
+ dsdt_line("{");
+ dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func);
+ dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)");
+ dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)");
+ dsdt_line(" {");
+ dsdt_line(" Offset (0x60),");
+ dsdt_line(" PIRA, 8,");
+ dsdt_line(" PIRB, 8,");
+ dsdt_line(" PIRC, 8,");
+ dsdt_line(" PIRD, 8,");
+ dsdt_line(" Offset (0x68),");
+ dsdt_line(" PIRE, 8,");
+ dsdt_line(" PIRF, 8,");
+ dsdt_line(" PIRG, 8,");
+ dsdt_line(" PIRH, 8");
+ dsdt_line(" }");
+ dsdt_line("");
+
+ /* Child devices are emitted one indentation level deeper. */
+ dsdt_indent(1);
+ /* Let every handler registered with LPC_DSDT() add its own devices. */
+ SET_FOREACH(ldpp, lpc_dsdt_set) {
+ ldp = *ldpp;
+ ldp->handler();
+ }
+
+ /* Interrupt controller: two I/O ports at each base plus IRQ 2. */
+ dsdt_line("");
+ dsdt_line("Device (PIC)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_ICU1, 2);
+ dsdt_fixed_ioport(IO_ICU2, 2);
+ dsdt_fixed_irq(2);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+
+ /* Timer: four I/O ports at IO_TIMER1_PORT plus IRQ 0. */
+ dsdt_line("");
+ dsdt_line("Device (TIMR)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_TIMER1_PORT, 4);
+ dsdt_fixed_irq(0);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+ dsdt_unindent(1);
+
+ /* Closes Device (ISA). */
+ dsdt_line("}");
+}
+
+/*
+ * DSDT handler that emits Device (SIO): one fixed I/O-port or 32-bit memory
+ * descriptor for every entry registered in lpc_sysres_set. Entries of any
+ * other type are skipped.
+ */
+static void
+pci_lpc_sysres_dsdt(void)
+{
+	struct lpc_sysres **entryp;
+	struct lpc_sysres *res;
+
+	dsdt_line("");
+	dsdt_line("Device (SIO)");
+	dsdt_line("{");
+	dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))");
+	dsdt_line(" Name (_CRS, ResourceTemplate ()");
+	dsdt_line(" {");
+
+	dsdt_indent(2);
+	SET_FOREACH(entryp, lpc_sysres_set) {
+		res = *entryp;
+		if (res->type == LPC_SYSRES_IO) {
+			dsdt_fixed_ioport(res->base, res->length);
+		} else if (res->type == LPC_SYSRES_MEM) {
+			dsdt_fixed_mem32(res->base, res->length);
+		}
+	}
+	dsdt_unindent(2);
+
+	dsdt_line(" })");
+	dsdt_line("}");
+}
+LPC_DSDT(pci_lpc_sysres_dsdt);
+
+/*
+ * DSDT handler that emits one PNP0501 device per enabled LPC UART, named
+ * after the unit, with _UID unit + 1 and the UART's I/O range and IRQ as its
+ * current resources. Units that lpc_init() did not enable are omitted.
+ */
+static void
+pci_lpc_uart_dsdt(void)
+{
+	struct lpc_uart_softc *uart;
+	int idx;
+
+	for (idx = 0; idx < LPC_UART_NUM; idx++) {
+		uart = &lpc_uart_softc[idx];
+		if (uart->enabled) {
+			dsdt_line("");
+			dsdt_line("Device (%s)", lpc_uart_names[idx]);
+			dsdt_line("{");
+			dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))");
+			dsdt_line(" Name (_UID, %d)", idx + 1);
+			dsdt_line(" Name (_CRS, ResourceTemplate ()");
+			dsdt_line(" {");
+			dsdt_indent(2);
+			dsdt_fixed_ioport(uart->iobase, UART_IO_BAR_SIZE);
+			dsdt_fixed_irq(uart->irq);
+			dsdt_unindent(2);
+			dsdt_line(" })");
+			dsdt_line("}");
+		}
+	}
+}
+LPC_DSDT(pci_lpc_uart_dsdt);
+
+/*
+ * Config-space write hook for the LPC bridge.
+ *
+ * Only single-byte writes to the PIRQ routing registers are handled here:
+ * offsets 0x60-0x63 map to pins 1-4 and 0x68-0x6b to pins 5-8. The value is
+ * passed to pirq_write() and the register is then refreshed from
+ * pirq_read(). Returns 0 when the write was consumed and -1 otherwise.
+ */
+static int
+pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int coff, int bytes, uint32_t val)
+{
+	int pin;
+
+	if (bytes != 1)
+		return (-1);
+
+	if (coff >= 0x68 && coff <= 0x6b)
+		pin = coff - 0x68 + 5;
+	else if (coff >= 0x60 && coff <= 0x63)
+		pin = coff - 0x60 + 1;
+	else
+		return (-1);
+
+	pirq_write(ctx, pin, val);
+	pci_set_cfgdata8(pi, coff, pirq_read(pin));
+	return (0);
+}
+
+/* BAR write hook installed in pci_de_lpc; writes are ignored. */
+static void
+pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+}
+
+/* BAR read hook installed in pci_de_lpc; every read returns 0. */
+static uint64_t
+pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ return (0);
+}
+
+/* PCI device and vendor IDs written into the bridge's config space. */
+#define LPC_DEV 0x7000
+#define LPC_VENDOR 0x8086
+
+/*
+ * Initialise the LPC bridge device instance 'pi'.
+ *
+ * At most one bridge may exist and it must sit on bus 0; restricting it to
+ * bus 0 simplifies the ACPI DSDT because it can provide a decode for all
+ * legacy I/O ports behind that bus. After the devices behind the bridge are
+ * brought up, the identifying config registers are filled in and the
+ * instance is recorded as the bridge. Returns 0 on success, -1 on failure.
+ */
+static int
+pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	const char *errmsg;
+
+	errmsg = NULL;
+	if (lpc_bridge != NULL)
+		errmsg = "Only one LPC bridge is allowed.\n";
+	else if (pi->pi_bus != 0)
+		errmsg = "LPC bridge can be present only on bus 0.\n";
+
+	if (errmsg != NULL) {
+		fprintf(stderr, "%s", errmsg);
+		return (-1);
+	}
+
+	if (lpc_init(ctx) != 0)
+		return (-1);
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA);
+
+	lpc_bridge = pi;
+
+	return (0);
+}
+
+/*
+ * Build the ACPI path of the interrupt link device for PIRQ 'pin'
+ * (pin 1 yields "...LNKA,", pin 2 "...LNKB,", and so on; the trailing comma
+ * is part of the returned text).
+ *
+ * Returns a string allocated by asprintf() that the caller must free(), or
+ * NULL if no LPC bridge is configured or the allocation failed.
+ */
+char *
+lpc_pirq_name(int pin)
+{
+	char *name;
+
+	if (lpc_bridge == NULL)
+		return (NULL);
+	/*
+	 * The contents of 'name' are not portable after a failed asprintf(),
+	 * so never hand it back in that case.
+	 */
+	if (asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1) < 0)
+		return (NULL);
+	return (name);
+}
+
+/*
+ * Refresh the bridge's PIRQ routing registers from the current pirq state:
+ * pins 1-4 go to config offsets 0x60-0x63 and pins 5-8 to 0x68-0x6b.
+ * Does nothing when no LPC bridge has been configured.
+ */
+void
+lpc_pirq_routed(void)
+{
+	int pirq;
+
+	if (lpc_bridge == NULL)
+		return;
+
+	for (pirq = 1; pirq <= 4; pirq++) {
+		pci_set_cfgdata8(lpc_bridge, 0x60 + (pirq - 1),
+		    pirq_read(pirq));
+	}
+	for (pirq = 5; pirq <= 8; pirq++) {
+		pci_set_cfgdata8(lpc_bridge, 0x68 + (pirq - 5),
+		    pirq_read(pirq));
+	}
+}
+
+/*
+ * Device emulation descriptor for the "lpc" device, wiring up the init,
+ * DSDT, config-write and BAR hooks defined in this file, and its
+ * registration through PCI_EMUL_SET().
+ */
+struct pci_devemu pci_de_lpc = {
+ .pe_emu = "lpc",
+ .pe_init = pci_lpc_init,
+ .pe_write_dsdt = pci_lpc_write_dsdt,
+ .pe_cfgwrite = pci_lpc_cfgwrite,
+ .pe_barwrite = pci_lpc_write,
+ .pe_barread = pci_lpc_read
+};
+PCI_EMUL_SET(pci_de_lpc);
diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h
new file mode 100644
index 0000000000..9041f79c50
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_lpc.h
@@ -0,0 +1,76 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LPC_H_
+#define _LPC_H_
+
+#include <sys/linker_set.h>
+
+/* Signature of a callback that emits DSDT text; it takes no arguments. */
+typedef void (*lpc_write_dsdt_t)(void);
+
+/* One registered DSDT callback. */
+struct lpc_dsdt {
+ lpc_write_dsdt_t handler;
+};
+
+/*
+ * Register 'handler' in the lpc_dsdt_set linker set. The backing object's
+ * name is derived from __LINE__, so use the macro at most once per source
+ * line.
+ */
+#define LPC_DSDT(handler) \
+ static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \
+ (handler), \
+ }; \
+ DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__))
+
+/* Kind of address range described by a struct lpc_sysres. */
+enum lpc_sysres_type {
+ LPC_SYSRES_IO,
+ LPC_SYSRES_MEM
+};
+
+/* One system resource range: its kind, start and length. */
+struct lpc_sysres {
+ enum lpc_sysres_type type;
+ uint32_t base;
+ uint32_t length;
+};
+
+/*
+ * Register a resource range in the lpc_sysres_set linker set. The backing
+ * object's name is derived from __LINE__, so use the macro at most once per
+ * source line.
+ */
+#define LPC_SYSRES(type, base, length) \
+ static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \
+ (type), \
+ (base), \
+ (length) \
+ }; \
+ DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__))
+
+/* Shorthands for registering an I/O port range or a memory range. */
+#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length)
+#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length)
+
+/* Parse "<lpc_device_name>[,<options>]"; returns 0 on success, -1 otherwise. */
+int lpc_device_parse(const char *opt);
+/* Print the device names accepted by lpc_device_parse(), one per line. */
+void lpc_print_supported_devices(void);
+/*
+ * Returns an allocated ACPI link-device path for PIRQ 'pin' (caller frees),
+ * or NULL when no LPC bridge is configured.
+ */
+char *lpc_pirq_name(int pin);
+/* Copy the current PIRQ routing into the LPC bridge's config registers. */
+void lpc_pirq_routed(void);
+/* Returns the configured boot ROM option string, or NULL if none was given. */
+const char *lpc_bootrom(void);
+
+#endif
diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c
new file mode 100644
index 0000000000..387611c888
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_nvme.c
@@ -0,0 +1,1897 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017 Shunsuke Mie
+ * Copyright (c) 2018 Leon Dang
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * bhyve PCIe-NVMe device emulation.
+ *
+ * options:
+ * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
+ *
+ * accepted devpath:
+ * /dev/blockdev
+ * /path/to/image
+ * ram=size_in_MiB
+ *
+ * maxq = max number of queues
+ * qsz = max elements in each queue
+ * ioslots = max number of concurrent io requests
+ * sectsz = sector size (defaults to blockif sector size)
+ * ser = serial number (20-chars max)
+ *
+ */
+
+/* TODO:
+ - create async event for smart and log
+ - intr coalesce
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/atomic.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <dev/nvme/nvme.h>
+
+#include "bhyverun.h"
+#include "block_if.h"
+#include "pci_emul.h"
+
+
+/* Set to non-zero to enable DPRINTF() output. */
+static int nvme_debug = 0;
+/*
+ * Both macros take a parenthesised printf() argument list, e.g.
+ * DPRINTF(("%s\n", x)). DPRINTF expands to a bare `if`, so wrap it in braces
+ * when it is used as the body of an if/else.
+ */
+#define DPRINTF(params) if (nvme_debug) printf params
+#define WPRINTF(params) printf params
+
+/* defaults; can be overridden */
+#define NVME_MSIX_BAR 4
+
+#define NVME_IOSLOTS 8
+
+#define NVME_QUEUES 16
+#define NVME_MAX_QENTRIES 2048
+
+/* Number of 64-bit entries that fit in one page. */
+#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
+#define NVME_MAX_BLOCKIOVS 512
+
+/* helpers */
+
+/* Convert a zero-based value into a one-based value */
+#define ONE_BASED(zero) ((zero) + 1)
+/* Convert a one-based value into a zero-based value */
+#define ZERO_BASED(one) ((one) - 1)
+
+/*
+ * Encode number of SQ's and CQ's for Set/Get Features: the zero-based
+ * submission queue count goes in bits 15:0 and the zero-based completion
+ * queue count in bits 31:16.
+ *
+ * The expansion is a single parenthesised expression without a trailing
+ * semicolon, so it can be used safely inside larger expressions.
+ */
+#define NVME_FEATURE_NUM_QUEUES(sc) \
+	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
+	((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
+
+/* Byte offset of the doorbell member within struct nvme_registers. */
+#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
+
+/* Byte offsets of the controller registers named by each constant. */
+enum nvme_controller_register_offsets {
+ NVME_CR_CAP_LOW = 0x00,
+ NVME_CR_CAP_HI = 0x04,
+ NVME_CR_VS = 0x08,
+ NVME_CR_INTMS = 0x0c,
+ NVME_CR_INTMC = 0x10,
+ NVME_CR_CC = 0x14,
+ NVME_CR_CSTS = 0x1c,
+ NVME_CR_NSSR = 0x20,
+ NVME_CR_AQA = 0x24,
+ NVME_CR_ASQ_LOW = 0x28,
+ NVME_CR_ASQ_HI = 0x2c,
+ NVME_CR_ACQ_LOW = 0x30,
+ NVME_CR_ACQ_HI = 0x34,
+};
+
+/*
+ * Bit masks applied to command dword 11 of the Create I/O Queue commands:
+ * PC is tested to reject non-contiguous queues and IEN selects whether
+ * completion interrupts are enabled.
+ */
+enum nvme_cmd_cdw11 {
+ NVME_CMD_CDW11_PC = 0x0001,
+ NVME_CMD_CDW11_IEN = 0x0002,
+ NVME_CMD_CDW11_IV = 0xFFFF0000,
+};
+
+/* NOTE(review): presumably flag values for a completion queue's intr_en. */
+#define NVME_CQ_INTEN 0x01
+#define NVME_CQ_INTCOAL 0x02
+
+/* Emulation state of one completion queue (admin queue is index 0). */
+struct nvme_completion_queue {
+ struct nvme_completion *qbase; /* host mapping from vm_map_gpa(), NULL when unset */
+ uint32_t size; /* number of entries (one-based) */
+ uint16_t tail; /* nvme progress */
+ uint16_t head; /* guest progress */
+ uint16_t intr_vec; /* taken from CDW11 bits 31:16 at creation */
+ uint32_t intr_en; /* IEN bit of CDW11 at creation */
+ pthread_mutex_t mtx; /* initialised when the queue array is allocated */
+};
+
+/* Emulation state of one submission queue (admin queue is index 0). */
+struct nvme_submission_queue {
+ struct nvme_command *qbase; /* host mapping from vm_map_gpa(), NULL when unset */
+ uint32_t size; /* number of entries (one-based) */
+ uint16_t head; /* nvme progress */
+ uint16_t tail; /* guest progress */
+ uint16_t cqid; /* completion queue id */
+ int busy; /* queue is being processed */
+ int qpriority; /* taken from CDW11 bits 2:1 at creation */
+};
+
+/* Kind of backing store behind the emulated namespace. */
+enum nvme_storage_type {
+ NVME_STOR_BLOCKIF = 0,
+ NVME_STOR_RAM = 1,
+};
+
+/* Description of the namespace's backing store. */
+struct pci_nvme_blockstore {
+ enum nvme_storage_type type;
+ void *ctx; /* NOTE(review): presumably the backend handle; confirm at the point of use */
+ uint64_t size; /* divided by sectsz to obtain the namespace size */
+ uint32_t sectsz; /* sector size */
+ uint32_t sectsz_bits; /* reported as LBADS, i.e. sector size is 2^sectsz_bits */
+};
+
+/* State of one in-flight I/O request; instances are chained through 'next'. */
+struct pci_nvme_ioreq {
+ struct pci_nvme_softc *sc;
+ struct pci_nvme_ioreq *next;
+ struct nvme_submission_queue *nvme_sq;
+ uint16_t sqid;
+
+ /* command information */
+ uint16_t opc;
+ uint16_t cid;
+ uint32_t nsid;
+
+ uint64_t prev_gpaddr;
+ size_t prev_size;
+
+ /*
+ * lock if all iovs consumed (big IO);
+ * complete transaction before continuing
+ */
+ pthread_mutex_t mtx;
+ pthread_cond_t cv;
+
+ struct blockif_req io_req;
+
+ /*
+ * pad to fit up to 512 page descriptors from guest IO request;
+ * must stay directly after io_req.
+ * NOTE(review): assumes the iovec array is the last member of
+ * struct blockif_req - confirm in block_if.h.
+ */
+ struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
+};
+
+/* Per-device state of one emulated NVMe controller. */
+struct pci_nvme_softc {
+ struct pci_devinst *nsc_pi;
+
+ /* Held around pci_nvme_reset_locked() by pci_nvme_reset(). */
+ pthread_mutex_t mtx;
+
+ struct nvme_registers regs;
+
+ struct nvme_namespace_data nsdata;
+ struct nvme_controller_data ctrldata;
+
+ struct pci_nvme_blockstore nvstore;
+
+ uint16_t max_qentries; /* max entries per queue */
+ uint32_t max_queues; /* max number of IO SQ's or CQ's */
+ uint32_t num_cqueues;
+ uint32_t num_squeues;
+
+ struct pci_nvme_ioreq *ioreqs;
+ struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
+ uint32_t pending_ios;
+ uint32_t ioslots;
+ sem_t iosemlock;
+
+ /*
+ * Memory mapped Submission and Completion queues
+ * Each array includes both Admin and IO queues
+ * (index 0 is the admin queue; the arrays hold max_queues + 1 entries).
+ */
+ struct nvme_completion_queue *compl_queues;
+ struct nvme_submission_queue *submit_queues;
+
+ /* controller features */
+ uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */
+ uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
+ uint32_t async_ev_config; /* 0x0B: async event config */
+};
+
+
+/* Forward declaration; the definition appears later in the file. */
+static void pci_nvme_io_partial(struct blockif_req *br, int err);
+
+/*
+ * Controller Configuration utils: each getter shifts the named field down
+ * to bit 0 and masks it.
+ */
+#define NVME_CC_GET_EN(cc) \
+ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
+#define NVME_CC_GET_CSS(cc) \
+ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
+#define NVME_CC_GET_SHN(cc) \
+ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
+#define NVME_CC_GET_IOSQES(cc) \
+ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
+#define NVME_CC_GET_IOCQES(cc) \
+ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
+
+/* In-place mask covering the EN, IOSQES and IOCQES fields of CC. */
+#define NVME_CC_WRITE_MASK \
+ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
+ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
+ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
+
+/*
+ * In-place mask covering the CSS, MPS and AMS fields of CC.
+ * NOTE(review): the name suggests these are writable only while the
+ * controller is not enabled - confirm at the point of use.
+ */
+#define NVME_CC_NEN_WRITE_MASK \
+ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
+ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
+ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
+
+/* Controller Status utils */
+#define NVME_CSTS_GET_RDY(sts) \
+ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
+
+/* The RDY bit in its register position. */
+#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
+
+/* Completion Queue status word utils */
+/* The phase bit in its position within the status word. */
+#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
+/* In-place mask covering the status code type and status code fields. */
+#define NVME_STATUS_MASK \
+ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
+ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
+
+/*
+ * Fill the dst_size-byte field 'dst' with the leading bytes of the string
+ * 'src' (at most dst_size of them, without a terminating NUL) and set every
+ * remaining byte to 'pad'. 'src' and 'dst' must not overlap.
+ */
+static __inline void
+cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
+{
+	size_t ncopy = strnlen(src, dst_size);
+
+	memcpy(dst, src, ncopy);
+	memset(dst + ncopy, pad, dst_size - ncopy);
+}
+
+/*
+ * Replace the status-code-type and status-code fields of the completion
+ * status word '*status' with 'type' and 'code'; all other bits are kept.
+ */
+static __inline void
+pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
+{
+	uint16_t sct_bits, sc_bits;
+
+	sct_bits = (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
+	sc_bits = (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
+
+	*status = (*status & ~NVME_STATUS_MASK) | sct_bits | sc_bits;
+}
+
+/*
+ * Set '*status' to the generic status code type (NVME_SCT_GENERIC) with
+ * status code 'code'.
+ */
+static __inline void
+pci_nvme_status_genc(uint16_t *status, uint16_t code)
+{
+
+ pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
+}
+
+/*
+ * Set the phase bit of '*status' to the inverse of 'prev': a non-zero
+ * 'prev' clears the bit, a zero 'prev' sets it. Other bits are untouched.
+ */
+static __inline void
+pci_nvme_toggle_phase(uint16_t *status, int prev)
+{
+	uint16_t cleared = *status & ~NVME_STATUS_P;
+
+	*status = prev ? cleared : (cleared | NVME_STATUS_P);
+}
+
+/*
+ * Populate the Identify Controller data structure kept in 'sc' with the
+ * fixed values advertised by this emulation. Fields not assigned here are
+ * left as they were.
+ */
+static void
+pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
+{
+	struct nvme_controller_data *ctrl;
+
+	ctrl = &sc->ctrldata;
+
+	/* Identity */
+	ctrl->vid = 0xFB5D;
+	ctrl->ssvid = 0x0000;
+	cpywithpad((char *)ctrl->mn, sizeof(ctrl->mn), "bhyve-NVMe", ' ');
+	cpywithpad((char *)ctrl->fr, sizeof(ctrl->fr), "1.0", ' ');
+
+	/* Num of submission commands that we can handle at a time (2^rab) */
+	ctrl->rab = 4;
+
+	/* FreeBSD OUI */
+	ctrl->ieee[0] = 0x58;
+	ctrl->ieee[1] = 0x9c;
+	ctrl->ieee[2] = 0xfc;
+
+	ctrl->mic = 0;
+
+	/* max data transfer size (2^mdts * CAP.MPSMIN) */
+	ctrl->mdts = 9;
+
+	ctrl->ver = 0x00010300;
+
+	/* Admin command set attributes */
+	ctrl->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
+	ctrl->acl = 2;
+	ctrl->aerl = 4;
+
+	/* TODO: support some simple things like SMART */
+	ctrl->lpa = 0;
+	/* max error log page entries */
+	ctrl->elpe = 0;
+	/* number of power states support */
+	ctrl->npss = 1;
+
+	/* Warning Composite Temperature Threshold */
+	ctrl->wctemp = 0x0157;
+
+	/* Queue entry sizes: the same value is used for max and min. */
+	ctrl->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
+	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
+	ctrl->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
+	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
+
+	/* number of namespaces */
+	ctrl->nn = 1;
+
+	ctrl->fna = 0x03;
+
+	ctrl->power_state[0].mp = 10;
+}
+
+/*
+ * Populate the Identify Namespace data kept in 'sc' from the backing store:
+ * size, capacity and utilisation are all the store size divided by the
+ * sector size, and a single LBA format is described whose data size is
+ * 2^sectsz_bits.
+ */
+static void
+pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_blockstore *store = &sc->nvstore;
+	struct nvme_namespace_data *ns = &sc->nsdata;
+
+	ns->nsze = store->size / store->sectsz;
+	ns->ncap = ns->nsze;
+	ns->nuse = ns->nsze;
+
+	/* Get LBA and backstore information from backing store */
+	ns->nlbaf = 1;
+	/* LBA data-sz = 2^lbads */
+	ns->lbaf[0] = store->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
+
+	ns->flbas = 0;
+}
+
+/*
+ * Reset the controller state in 'sc'.
+ *
+ * The capability and version registers are rebuilt, CC and CSTS are
+ * cleared, and the queue counts return to max_queues. On the first call the
+ * submission and completion queue arrays (max_queues + 1 entries each, index
+ * 0 being the admin queue) are allocated and the completion queue mutexes
+ * initialised; on later calls the existing entries are cleared instead.
+ * pci_nvme_reset() calls this with sc->mtx held.
+ */
+static void
+pci_nvme_reset_locked(struct pci_nvme_softc *sc)
+{
+	struct nvme_submission_queue *sq;
+	struct nvme_completion_queue *cq;
+	uint32_t qid;
+
+	DPRINTF(("%s\r\n", __func__));
+
+	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
+	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
+	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
+
+	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
+
+	sc->regs.vs = 0x00010300; /* NVMe v1.3 */
+
+	sc->regs.cc = 0;
+	sc->regs.csts = 0;
+
+	sc->num_squeues = sc->max_queues;
+	sc->num_cqueues = sc->max_queues;
+
+	if (sc->submit_queues == NULL) {
+		sc->submit_queues = calloc(sc->num_squeues + 1,
+		    sizeof(struct nvme_submission_queue));
+	} else {
+		for (qid = 0; qid < sc->num_squeues + 1; qid++) {
+			sq = &sc->submit_queues[qid];
+			/*
+			 * The Admin Submission Queue is at index 0.
+			 * It must not be changed at reset otherwise the
+			 * emulation will be out of sync with the guest.
+			 */
+			if (qid != 0) {
+				sq->qbase = NULL;
+				sq->size = 0;
+				sq->cqid = 0;
+			}
+			sq->tail = 0;
+			sq->head = 0;
+			sq->busy = 0;
+		}
+	}
+
+	if (sc->compl_queues == NULL) {
+		sc->compl_queues = calloc(sc->num_cqueues + 1,
+		    sizeof(struct nvme_completion_queue));
+
+		for (qid = 0; qid < sc->num_cqueues + 1; qid++) {
+			pthread_mutex_init(&sc->compl_queues[qid].mtx, NULL);
+		}
+	} else {
+		for (qid = 0; qid < sc->num_cqueues + 1; qid++) {
+			cq = &sc->compl_queues[qid];
+			/* The admin queue mapping (index 0) is kept, as above. */
+			if (qid != 0) {
+				cq->qbase = NULL;
+				cq->size = 0;
+			}
+
+			cq->tail = 0;
+			cq->head = 0;
+		}
+	}
+}
+
+/* Reset the controller state while holding sc->mtx. */
+static void
+pci_nvme_reset(struct pci_nvme_softc *sc)
+{
+ pthread_mutex_lock(&sc->mtx);
+ pci_nvme_reset_locked(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Set up the admin queues from the guest-programmed registers: the sizes
+ * come from the zero-based ASQS and ACQS fields of AQA, and the queue memory
+ * at ASQ / ACQ is mapped into the host with vm_map_gpa().
+ */
+static void
+pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
+{
+	struct nvme_submission_queue *adminsq = &sc->submit_queues[0];
+	struct nvme_completion_queue *admincq = &sc->compl_queues[0];
+	uint16_t nsqe, ncqe;
+
+	DPRINTF(("%s\r\n", __func__));
+
+	nsqe = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
+	adminsq->size = nsqe;
+	adminsq->qbase = vm_map_gpa(ctx, sc->regs.asq,
+	    sizeof(struct nvme_command) * nsqe);
+
+	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
+	    __func__, sc->regs.asq, adminsq->qbase));
+
+	ncqe = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
+	    NVME_AQA_REG_ACQS_MASK) + 1;
+	admincq->size = ncqe;
+	admincq->qbase = vm_map_gpa(ctx, sc->regs.acq,
+	    sizeof(struct nvme_completion) * ncqe);
+
+	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
+	    __func__, sc->regs.acq, admincq->qbase));
+}
+
+/*
+ * Admin command: Delete I/O Submission Queue.
+ *
+ * The queue id is the low 16 bits of CDW10. Id 0 (the admin queue) or an id
+ * above num_squeues completes with Invalid Queue Identifier; otherwise the
+ * queue's mapping is dropped and the command succeeds. Always returns 1.
+ */
+static int
+nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	uint16_t qid = command->cdw10 & 0xffff;
+
+	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
+
+	if (qid != 0 && qid <= sc->num_squeues) {
+		sc->submit_queues[qid].qbase = NULL;
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	} else {
+		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
+		    __func__, qid, sc->num_squeues));
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+	}
+
+	return (1);
+}
+
+/*
+ * Admin command: Create I/O Submission Queue.
+ *
+ * Only physically contiguous queues (PC bit set in CDW11) are supported;
+ * anything else completes with Invalid Field. The queue id (CDW10 15:0)
+ * must be between 1 and num_squeues, otherwise the command completes with
+ * Invalid Queue Identifier. On success the queue size (CDW10 31:16,
+ * zero-based), the host mapping of PRP1, the completion queue id
+ * (CDW11 31:16) and the priority (CDW11 2:1) are recorded.
+ * Always returns 1.
+ */
+static int
+nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	struct nvme_submission_queue *nsq;
+	uint16_t qid;
+
+	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
+		/*
+		 * Guest sent non-cont submission queue request.
+		 * This setting is unsupported by this emulation.
+		 */
+		WPRINTF(("%s unsupported non-contig (list-based) "
+		    "create i/o submission queue\r\n", __func__));
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	qid = command->cdw10 & 0xffff;
+	if (qid == 0 || qid > sc->num_squeues) {
+		WPRINTF(("%s queue index %u > num_squeues %u\r\n",
+		    __func__, qid, sc->num_squeues));
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	nsq = &sc->submit_queues[qid];
+	nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+
+	nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+	    sizeof(struct nvme_command) * (size_t)nsq->size);
+	nsq->cqid = (command->cdw11 >> 16) & 0xffff;
+	nsq->qpriority = (command->cdw11 >> 1) & 0x03;
+
+	DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
+	    qid, nsq->size, nsq->qbase, nsq->cqid));
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+	DPRINTF(("%s completed creating IOSQ qid %u\r\n",
+	    __func__, qid));
+
+	return (1);
+}
+
+/*
+ * Admin command: Delete I/O Completion Queue.
+ *
+ * The queue id is the low 16 bits of CDW10. Id 0 (the admin queue) or an id
+ * above num_cqueues completes with Invalid Queue Identifier; otherwise the
+ * queue's mapping is dropped and the command succeeds. Always returns 1.
+ */
+static int
+nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	uint16_t qid = command->cdw10 & 0xffff;
+
+	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
+
+	if (qid != 0 && qid <= sc->num_cqueues) {
+		sc->compl_queues[qid].qbase = NULL;
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	} else {
+		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
+		    __func__, qid, sc->num_cqueues));
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+	}
+
+	return (1);
+}
+
+/*
+ * Admin command: Create I/O Completion Queue.
+ *
+ * Only physically contiguous queues (PC bit set in CDW11) are supported;
+ * anything else completes with generic status 0x12. The queue id
+ * (CDW10 15:0) must be between 1 and num_cqueues, otherwise the command
+ * completes with Invalid Queue Identifier. On success the interrupt enable
+ * bit and vector (CDW11), the queue size (CDW10 31:16, zero-based) and the
+ * host mapping of PRP1 are recorded. Always returns 1.
+ */
+static int
+nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	struct nvme_completion_queue *ncq;
+	uint16_t qid;
+
+	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
+		/*
+		 * Non-contig completion queue unsupported.
+		 */
+		WPRINTF(("%s unsupported non-contig (list-based) "
+		    "create i/o completion queue\r\n",
+		    __func__));
+
+		/* 0x12 = Invalid Use of Controller Memory Buffer */
+		pci_nvme_status_genc(&compl->status, 0x12);
+		return (1);
+	}
+
+	qid = command->cdw10 & 0xffff;
+	if (qid == 0 || qid > sc->num_cqueues) {
+		WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
+		    __func__, qid, sc->num_cqueues));
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	ncq = &sc->compl_queues[qid];
+	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
+	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
+	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+
+	/*
+	 * The queue holds completion entries, so the mapping is sized by
+	 * struct nvme_completion. Sizing it by the command structure would
+	 * request more guest memory than the queue occupies.
+	 */
+	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
+	    command->prp1,
+	    sizeof(struct nvme_completion) * (size_t)ncq->size);
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+	return (1);
+}
+
+/*
+ * Admin command: Get Log Page.
+ *
+ * The log page id is CDW10 7:0 and the transfer length is the zero-based
+ * dword count in CDW10 27:16. Pages 0x01 (error information), 0x02
+ * (SMART/health) and 0x03 (firmware slot) are returned as all zeroes,
+ * limited to one page of data; any other id completes with Invalid Log
+ * Page. If the guest buffer at PRP1 cannot be mapped the command completes
+ * with Invalid Field. Always returns 1.
+ */
+static int
+nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	/* The count is in dwords, so scale by the dword size to get bytes. */
+	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) *
+	    sizeof(uint32_t);
+	uint8_t logpage = command->cdw10 & 0xFF;
+	void *data;
+
+	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
+
+	switch (logpage) {
+	case 0x01: /* Error information */
+	case 0x02: /* SMART/Health information */
+		/* TODO: present some smart info */
+	case 0x03: /* Firmware slot information */
+		break;
+	default:
+		WPRINTF(("%s get log page %x command not supported\r\n",
+		    __func__, logpage));
+
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_LOG_PAGE);
+		return (1);
+	}
+
+	if (logsize > PAGE_SIZE)
+		logsize = PAGE_SIZE;
+
+	/* PRP1 is guest-controlled; never write through a failed mapping. */
+	data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, logsize);
+	if (data == NULL) {
+		WPRINTF(("%s unable to map log page buffer\r\n", __func__));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	memset(data, 0, logsize);
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+	return (1);
+}
+
+/*
+ * Admin command: Identify.
+ *
+ * The low byte of CDW10 selects what is copied to the guest buffer at PRP1:
+ * 0x00 the namespace data, 0x01 the controller data, 0x02 a namespace id
+ * list holding just id 1. Selector 0x11 completes with Invalid Namespace or
+ * Format and every other selector with Invalid Field. Always returns 1.
+ */
+static int
+nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
+    struct nvme_completion* compl)
+{
+	uint32_t *nslist;
+	void *dest;
+	uint16_t sc_code = NVME_SC_SUCCESS;
+
+	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
+	    command->cdw10 & 0xFF, command->nsid));
+
+	switch (command->cdw10 & 0xFF) {
+	case 0x00: /* return Identify Namespace data structure */
+		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    sizeof(sc->nsdata));
+		memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
+		break;
+	case 0x01: /* return Identify Controller data structure */
+		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    sizeof(sc->ctrldata));
+		memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
+		break;
+	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
+		nslist = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    sizeof(uint32_t) * 1024);
+		nslist[0] = 1;
+		nslist[1] = 0;
+		break;
+	case 0x11:
+		sc_code = NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+		break;
+	default:
+		/* Includes 0x03, 0x10 and 0x12-0x15, none of which are supported. */
+		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
+		    __func__, command->cdw10 & 0xFF));
+		sc_code = NVME_SC_INVALID_FIELD;
+		break;
+	}
+
+	pci_nvme_status_genc(&compl->status, sc_code);
+	return (1);
+}
+
+/*
+ * Set Features / Number of Queues.
+ *
+ * CDW11[15:0] carries the requested submission queue count and
+ * CDW11[31:16] the requested completion queue count.  A request of 0xffff
+ * in either field is rejected with INVALID_FIELD and -1 is returned.
+ * Otherwise each count is converted with ONE_BASED(), clamped to
+ * sc->max_queues, the result is reported in compl->cdw0 and 0 is returned.
+ */
+static int
+nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	const uint16_t sq_req = command->cdw11 & 0xFFFF;
+	const uint16_t cq_req = (command->cdw11 >> 16) & 0xFFFF;
+
+	if (sq_req == 0xffff) {
+		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, sq_req));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (-1);
+	}
+
+	/* the submission queue count is committed before NCQR is examined */
+	sc->num_squeues = ONE_BASED(sq_req);
+	if (sc->num_squeues > sc->max_queues) {
+		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
+					sc->max_queues));
+		sc->num_squeues = sc->max_queues;
+	}
+
+	if (cq_req == 0xffff) {
+		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, cq_req));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (-1);
+	}
+
+	sc->num_cqueues = ONE_BASED(cq_req);
+	if (sc->num_cqueues > sc->max_queues) {
+		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
+					sc->max_queues));
+		sc->num_cqueues = sc->max_queues;
+	}
+
+	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
+
+	return (0);
+}
+
+/*
+ * Admin SET FEATURES: dispatch on the feature identifier in CDW10[7:0].
+ *
+ * Most features are only logged.  Number of Queues, Interrupt Coalescing,
+ * Interrupt Vector Configuration and Async Event Configuration update
+ * controller state.  compl->cdw0 is cleared up front and compl->status
+ * carries the result.  Always returns 1 (the caller's interrupt flag).
+ */
+static int
+nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	int feature = command->cdw10 & 0xFF;
+	uint32_t iv;
+
+	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
+	compl->cdw0 = 0;
+
+	switch (feature) {
+	case NVME_FEAT_ARBITRATION:
+		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_POWER_MANAGEMENT:
+		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_LBA_RANGE_TYPE:
+		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_TEMPERATURE_THRESHOLD:
+		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_ERROR_RECOVERY:
+		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_VOLATILE_WRITE_CACHE:
+		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_NUMBER_OF_QUEUES:
+		/*
+		 * On failure the helper has already stored the error in
+		 * compl->status; return now so it is not replaced by the
+		 * SUCCESS status set at the end of this function.
+		 */
+		if (nvme_set_feature_queues(sc, command, compl) != 0)
+			return (1);
+		break;
+	case NVME_FEAT_INTERRUPT_COALESCING:
+		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
+
+		/* in uS */
+		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
+
+		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
+		break;
+	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
+		iv = command->cdw11 & 0xFFFF;
+
+		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
+		         command->cdw11));
+
+		/* bit 16 of CDW11 selects the NVME_CQ_INTCOAL flag value */
+		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
+			if (sc->compl_queues[i].intr_vec == iv) {
+				if (command->cdw11 & (1 << 16))
+					sc->compl_queues[i].intr_en |=
+					                      NVME_CQ_INTCOAL;
+				else
+					sc->compl_queues[i].intr_en &=
+					                     ~NVME_CQ_INTCOAL;
+			}
+		}
+		break;
+	case NVME_FEAT_WRITE_ATOMICITY:
+		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+		DPRINTF(("  async event configuration 0x%x\r\n",
+		         command->cdw11));
+		sc->async_ev_config = command->cdw11;
+		break;
+	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+		DPRINTF(("  software progress marker 0x%x\r\n",
+		         command->cdw11));
+		break;
+	case 0x0C:
+		DPRINTF(("  autonomous power state transition 0x%x\r\n",
+		         command->cdw11));
+		break;
+	default:
+		WPRINTF(("%s invalid feature\r\n", __func__));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+/*
+ * Admin GET FEATURES: dispatch on the feature identifier in CDW10[7:0] and
+ * report the feature value in compl->cdw0.
+ *
+ * Features without emulated state report 0.  compl->status carries the
+ * result.  Always returns 1 (the caller's interrupt flag).
+ */
+static int
+nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	int feature = command->cdw10 & 0xFF;
+
+	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
+
+	compl->cdw0 = 0;
+
+	switch (feature) {
+	case NVME_FEAT_ARBITRATION:
+		DPRINTF(("  arbitration\r\n"));
+		break;
+	case NVME_FEAT_POWER_MANAGEMENT:
+		DPRINTF(("  power management\r\n"));
+		break;
+	case NVME_FEAT_LBA_RANGE_TYPE:
+		DPRINTF(("  lba range\r\n"));
+		break;
+	case NVME_FEAT_TEMPERATURE_THRESHOLD:
+		DPRINTF(("  temperature threshold\r\n"));
+		/* CDW11[21:20] selects which threshold is being queried */
+		switch ((command->cdw11 >> 20) & 0x3) {
+		case 0:
+			/* Over temp threshold */
+			compl->cdw0 = 0xFFFF;
+			break;
+		case 1:
+			/* Under temp threshold */
+			compl->cdw0 = 0;
+			break;
+		default:
+			WPRINTF(("  invalid threshold type select\r\n"));
+			pci_nvme_status_genc(&compl->status,
+			    NVME_SC_INVALID_FIELD);
+			return (1);
+		}
+		break;
+	case NVME_FEAT_ERROR_RECOVERY:
+		DPRINTF(("  error recovery\r\n"));
+		break;
+	case NVME_FEAT_VOLATILE_WRITE_CACHE:
+		DPRINTF(("  volatile write cache\r\n"));
+		break;
+	case NVME_FEAT_NUMBER_OF_QUEUES:
+		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
+
+		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
+		        compl->cdw0 & 0xFFFF,
+		        (compl->cdw0 >> 16) & 0xFFFF));
+
+		break;
+	case NVME_FEAT_INTERRUPT_COALESCING:
+		DPRINTF(("  interrupt coalescing\r\n"));
+		break;
+	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
+		DPRINTF(("  interrupt vector configuration\r\n"));
+		break;
+	case NVME_FEAT_WRITE_ATOMICITY:
+		DPRINTF(("  write atomicity\r\n"));
+		break;
+	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+		DPRINTF(("  async event configuration\r\n"));
+		/*
+		 * A "get" must report the value stored by Set Features,
+		 * not overwrite it with whatever the guest left in CDW11.
+		 */
+		compl->cdw0 = sc->async_ev_config;
+		break;
+	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+		DPRINTF(("  software progress marker\r\n"));
+		break;
+	case 0x0C:
+		DPRINTF(("  autonomous power state transition\r\n"));
+		break;
+	default:
+		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+/*
+ * Admin ABORT.  Nothing is actually aborted: the request is logged, the
+ * command completes with SUCCESS and compl->cdw0 is set to 1.
+ * NOTE(review): cdw0 bit 0 presumably tells the guest the command was not
+ * aborted -- confirm against the NVMe specification.
+ * Always returns 1 (the caller's interrupt flag).
+ */
+static int
+nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
+	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
+
+	/* TODO: search for the command ID and abort it */
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	compl->cdw0 = 1;
+
+	return (1);
+}
+
+#ifdef __FreeBSD__
+/*
+ * Admin ASYNC EVENT REQUEST.  No events are ever generated, so every
+ * request is answered with the command-specific status
+ * ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED.  Returns 0 (no interrupt requested).
+ *
+ * TODO: raise events when they happen based on the Set Features cmd.
+ * These events happen async, so only set completion successful if
+ * there is an event reflective of the request to get event.
+ */
+static int
+nvme_opc_async_event_req(struct pci_nvme_softc* sc,
+	struct nvme_command* command, struct nvme_completion* compl)
+{
+	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
+
+	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+
+	return (0);
+}
+#else
+/* The function above is unused, so it is compiled out to avoid a warning. */
+#endif /* __FreeBSD__ */
+
+/*
+ * Drain the admin submission queue (queue 0).
+ *
+ * Each command between the queue head and tail is dispatched to its opcode
+ * handler, and a completion entry is posted to admin completion queue 0 for
+ * everything except ASYNC_EVENT_REQUEST.  sq->busy serves as a try-lock: if
+ * it is already set the call returns without doing any work.  An MSI-X
+ * interrupt on vector 0 is raised at the end if any handler asked for one.
+ *
+ * value is the doorbell value written by the guest; it is only logged here.
+ */
+static void
+pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
+{
+	struct nvme_completion compl;
+	struct nvme_command *cmd;
+	struct nvme_submission_queue *sq;
+	struct nvme_completion_queue *cq;
+	int do_intr = 0;
+	uint16_t sqhead;
+
+	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
+
+	sq = &sc->submit_queues[0];
+
+	if (atomic_testandset_int(&sq->busy, 1)) {
+		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
+		        __func__, sq->head, sq->tail));
+		return;
+	}
+
+	/*
+	 * Sample the head only once the queue is owned (as the I/O path
+	 * does); a value read earlier may already have been consumed by the
+	 * previous owner.
+	 */
+	sqhead = atomic_load_acq_short(&sq->head);
+
+	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
+
+	while (sqhead != atomic_load_acq_short(&sq->tail)) {
+		cmd = &(sq->qbase)[sqhead];
+		/* start clean so no stale stack contents reach the guest */
+		compl.cdw0 = 0;
+		compl.status = 0;
+
+		switch (cmd->opc) {
+		case NVME_OPC_DELETE_IO_SQ:
+			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
+			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
+			break;
+		case NVME_OPC_CREATE_IO_SQ:
+			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
+			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
+			break;
+		case NVME_OPC_DELETE_IO_CQ:
+			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
+			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
+			break;
+		case NVME_OPC_CREATE_IO_CQ:
+			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
+			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
+			break;
+		case NVME_OPC_GET_LOG_PAGE:
+			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
+			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
+			break;
+		case NVME_OPC_IDENTIFY:
+			DPRINTF(("%s command IDENTIFY\r\n", __func__));
+			do_intr |= nvme_opc_identify(sc, cmd, &compl);
+			break;
+		case NVME_OPC_ABORT:
+			DPRINTF(("%s command ABORT\r\n", __func__));
+			do_intr |= nvme_opc_abort(sc, cmd, &compl);
+			break;
+		case NVME_OPC_SET_FEATURES:
+			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
+			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
+			break;
+		case NVME_OPC_GET_FEATURES:
+			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
+			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
+			break;
+		case NVME_OPC_ASYNC_EVENT_REQUEST:
+			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
+			/* XXX unhandled for now; no completion is posted */
+			break;
+		default:
+			WPRINTF(("0x%x command is not implemented\r\n",
+			    cmd->opc));
+			/* fail unknown opcodes instead of reporting success */
+			pci_nvme_status_genc(&compl.status,
+			    NVME_SC_INVALID_OPCODE);
+			do_intr |= 1;
+			break;
+		}
+
+		/* for now skip async event generation */
+		if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
+			struct nvme_completion *cp;
+			int phase;
+
+			cq = &sc->compl_queues[0];
+
+			cp = &(cq->qbase)[cq->tail];
+			cp->cdw0 = compl.cdw0;
+			cp->sqid = 0;
+			cp->sqhd = sqhead;
+			cp->cid = cmd->cid;
+
+			/* read the slot's previous phase before overwriting */
+			phase = NVME_STATUS_GET_P(cp->status);
+			cp->status = compl.status;
+			pci_nvme_toggle_phase(&cp->status, phase);
+
+			cq->tail = (cq->tail + 1) % cq->size;
+		}
+		sqhead = (sqhead + 1) % sq->size;
+	}
+
+	DPRINTF(("setting sqhead %u\r\n", sqhead));
+	atomic_store_short(&sq->head, sqhead);
+	atomic_store_int(&sq->busy, 0);
+
+	if (do_intr)
+		pci_generate_msix(sc->nsc_pi, 0);
+
+}
+
+/*
+ * Add one guest memory range to an I/O transfer.
+ *
+ * With a non-NULL req (blockif backing store) the range is appended to the
+ * request's iovec list, merging it with the previous entry when the two are
+ * physically contiguous.  If the list is already full, the entries gathered
+ * so far are submitted as a partial request and waited for before the list
+ * is restarted.
+ *
+ * With a NULL req (RAM backing store) the data is copied immediately
+ * between guest memory and the RAM buffer at byte offset lba.
+ *
+ * gpaddr/size describe the guest range, do_write selects the direction and
+ * lba is a byte offset into the backing store.  Returns 0 on success, -1 if
+ * a RAM transfer is out of range or the guest range cannot be mapped.
+ */
+static int
+pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
+	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
+{
+	int iovidx;
+
+	if (req != NULL) {
+		/*
+		 * concatenate contig block-iovs to minimize number of iovs.
+		 * An empty list has no previous entry to extend: without the
+		 * count check a first range at guest address 0 would index
+		 * br_iov[-1].
+		 */
+		if (req->io_req.br_iovcnt > 0 &&
+		    (req->prev_gpaddr + req->prev_size) == gpaddr) {
+			iovidx = req->io_req.br_iovcnt - 1;
+
+			req->io_req.br_iov[iovidx].iov_base =
+			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
+			                     req->prev_gpaddr, size);
+
+			req->prev_size += size;
+			req->io_req.br_resid += size;
+
+			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
+		} else {
+			pthread_mutex_lock(&req->mtx);
+
+			iovidx = req->io_req.br_iovcnt;
+			if (iovidx == NVME_MAX_BLOCKIOVS) {
+				int err = 0;
+
+				DPRINTF(("large I/O, doing partial req\r\n"));
+
+				iovidx = 0;
+				req->io_req.br_iovcnt = 0;
+
+				req->io_req.br_callback = pci_nvme_io_partial;
+
+				if (!do_write)
+					err = blockif_read(sc->nvstore.ctx,
+					                   &req->io_req);
+				else
+					err = blockif_write(sc->nvstore.ctx,
+					                    &req->io_req);
+
+				/* wait until req completes before cont */
+				if (err == 0)
+					pthread_cond_wait(&req->cv, &req->mtx);
+			}
+			if (iovidx == 0) {
+				req->io_req.br_offset = lba;
+				req->io_req.br_resid = 0;
+				req->io_req.br_param = req;
+			}
+
+			req->io_req.br_iov[iovidx].iov_base =
+			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
+			                     gpaddr, size);
+
+			req->io_req.br_iov[iovidx].iov_len = size;
+
+			req->prev_gpaddr = gpaddr;
+			req->prev_size = size;
+			req->io_req.br_resid += size;
+
+			req->io_req.br_iovcnt++;
+
+			pthread_mutex_unlock(&req->mtx);
+		}
+	} else {
+		/* RAM buffer: read/write directly */
+		void *p = sc->nvstore.ctx;
+		void *gptr;
+
+		/* written so that a huge guest-supplied lba cannot wrap */
+		if (lba > sc->nvstore.size ||
+		    size > sc->nvstore.size - lba) {
+			WPRINTF(("%s write would overflow RAM\r\n", __func__));
+			return (-1);
+		}
+
+		p = (void *)((uintptr_t)p + (uintptr_t)lba);
+		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
+		if (gptr == NULL) {
+			WPRINTF(("%s unable to map guest address 0x%lx\r\n",
+			    __func__, gpaddr));
+			return (-1);
+		}
+		if (do_write)
+			memcpy(p, gptr, size);
+		else
+			memcpy(gptr, p, size);
+	}
+	return (0);
+}
+
+/*
+ * Post a completion entry for a command from submission queue sq.
+ *
+ * The entry is written at the tail of the completion queue paired with sq
+ * (sq->cqid) while holding that queue's mutex, with the phase bit of the
+ * slot toggled, and the tail is advanced.  If the completion queue has
+ * NVME_CQ_INTEN set, its MSI-X vector is raised, unless the submission
+ * queue is still marked busy and ignore_busy is 0.
+ *
+ * sqid, cid, cdw0 and status are copied into the completion entry.
+ */
+static void
+pci_nvme_set_completion(struct pci_nvme_softc *sc,
+	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
+	uint32_t cdw0, uint16_t status, int ignore_busy)
+{
+	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
+	struct nvme_completion *compl;
+	int do_intr = 0;
+	int phase;
+
+	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
+		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
+		 NVME_STATUS_GET_SC(status)));
+
+	pthread_mutex_lock(&cq->mtx);
+
+	assert(cq->qbase != NULL);
+
+	compl = &cq->qbase[cq->tail];
+
+	/* the slot is reused, so every field must be rewritten */
+	compl->cdw0 = cdw0;
+	compl->sqhd = atomic_load_acq_short(&sq->head);
+	compl->sqid = sqid;
+	compl->cid = cid;
+
+	/* toggle phase: read the old bit before status is overwritten */
+	phase = NVME_STATUS_GET_P(compl->status);
+	compl->status = status;
+	pci_nvme_toggle_phase(&compl->status, phase);
+
+	cq->tail = (cq->tail + 1) % cq->size;
+
+	if (cq->intr_en & NVME_CQ_INTEN)
+		do_intr = 1;
+
+	pthread_mutex_unlock(&cq->mtx);
+
+	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
+		if (do_intr)
+			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
+}
+
+/*
+ * Return an I/O request slot to the controller's free list.
+ *
+ * The slot's back-pointers are cleared, it is pushed onto sc->ioreqs_free
+ * and the pending I/O count is dropped under sc->mtx.  When the last pending
+ * I/O finishes while the controller is enabled but not yet ready, CSTS.RDY
+ * is set.  Finally one unit is posted to the slot semaphore.
+ */
+static void
+pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
+{
+	req->sqid = 0;
+	req->nvme_sq = NULL;
+	req->sc = NULL;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/* push onto the head of the free list */
+	req->next = sc->ioreqs_free;
+	sc->ioreqs_free = req;
+	sc->pending_ios--;
+
+	/* when no more IO pending, can set to ready if device reset/enabled */
+	if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc)) {
+		if (!(NVME_CSTS_GET_RDY(sc->regs.csts)))
+			sc->regs.csts |= NVME_CSTS_RDY;
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	sem_post(&sc->iosemlock);
+}
+
+/*
+ * Take an I/O request slot from the controller's free list.
+ *
+ * Blocks on the slot semaphore until one is available, pops the head of
+ * sc->ioreqs_free under sc->mtx, bumps the pending I/O count and resets the
+ * slot's block request and contiguity-tracking fields before returning it.
+ */
+static struct pci_nvme_ioreq *
+pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_ioreq *slot;
+
+	sem_wait(&sc->iosemlock);
+
+	pthread_mutex_lock(&sc->mtx);
+	slot = sc->ioreqs_free;
+	assert(slot != NULL);
+	sc->ioreqs_free = slot->next;
+	slot->next = NULL;
+	slot->sc = sc;
+	sc->pending_ios++;
+	pthread_mutex_unlock(&sc->mtx);
+
+	/* hand the slot out in a clean state */
+	slot->io_req.br_iovcnt = 0;
+	slot->io_req.br_offset = 0;
+	slot->io_req.br_resid = 0;
+	slot->io_req.br_param = slot;
+	slot->prev_gpaddr = 0;
+	slot->prev_size = 0;
+
+	return (slot);
+}
+
+/*
+ * blockif completion callback for a finished NVMe read or write.
+ *
+ * Any non-zero err is reported to the guest as DATA_TRANSFER_ERROR,
+ * otherwise SUCCESS.  The completion is posted and the I/O request slot
+ * (found through br->br_param) is returned to the free list.
+ */
+static void
+pci_nvme_io_done(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+	uint16_t status = 0;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	/* TODO return correct error */
+	if (err)
+		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
+	else
+		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+
+	pci_nvme_set_completion(req->sc, req->nvme_sq, req->sqid, req->cid,
+	    0, status, 0);
+	pci_nvme_release_ioreq(req->sc, req);
+}
+
+/*
+ * blockif completion callback for the intermediate ("partial") requests
+ * issued by pci_nvme_append_iov_req() when the iovec list fills up; it wakes
+ * the thread waiting on req->cv.
+ *
+ * The submitter holds req->mtx from before the request is issued until it
+ * enters pthread_cond_wait().  Taking the same mutex here keeps the signal
+ * from firing before the waiter is actually waiting, which would otherwise
+ * lose the wakeup and leave the submitter blocked.
+ */
+static void
+pci_nvme_io_partial(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	pthread_mutex_lock(&req->mtx);
+	pthread_cond_signal(&req->cv);
+	pthread_mutex_unlock(&req->mtx);
+}
+
+/* Process every queued command on I/O submission queue idx, from its head up to its tail. */
+static void
+pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
+{
+ struct nvme_submission_queue *sq;
+ uint16_t status = 0;
+ uint16_t sqhead;
+ int err;
+
+ /* handle all submissions up to sq->tail index */
+ sq = &sc->submit_queues[idx];
+ /* sq->busy is used as a try-lock: give up if it was already set */
+ if (atomic_testandset_int(&sq->busy, 1)) {
+ DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
+ return;
+ }
+
+ sqhead = atomic_load_acq_short(&sq->head);
+
+ DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
+ idx, sqhead, sq->tail, sq->qbase));
+
+ while (sqhead != atomic_load_acq_short(&sq->tail)) {
+ struct nvme_command *cmd;
+ struct pci_nvme_ioreq *req = NULL;
+ uint64_t lba;
+ uint64_t nblocks, bytes, size, cpsz;
+
+ /* TODO: support scatter gather list handling */
+
+ cmd = &sq->qbase[sqhead];
+ sqhead = (sqhead + 1) % sq->size;
+ /* starting LBA is the 64-bit value formed from cdw11 (high) and cdw10 (low) */
+ lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+ /* FLUSH and opcode 0x08 are completed with SUCCESS without touching the backing store */
+ if (cmd->opc == NVME_OPC_FLUSH) {
+ pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+ pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+ status, 1);
+
+ continue;
+ } else if (cmd->opc == 0x08) {
+ /* TODO: write zeroes */
+ WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
+ __func__, lba, cmd->cdw12 & 0xFFFF));
+ pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+ pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+ status, 1);
+
+ continue;
+ }
+ /* cdw12[15:0] holds the block count minus one */
+ nblocks = (cmd->cdw12 & 0xFFFF) + 1;
+
+ bytes = nblocks * sc->nvstore.sectsz;
+ /* only the blockif backend uses a request slot; req stays NULL for RAM */
+ if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
+ req = pci_nvme_get_ioreq(sc);
+ req->nvme_sq = sq;
+ req->sqid = idx;
+ }
+
+ /*
+ * If data starts mid-page and flows into the next page, then
+ * increase page count
+ */
+
+ DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
+ "(%lu-bytes)\r\n",
+ sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
+ cmd->opc == NVME_OPC_WRITE ?
+ "WRITE" : "READ",
+ lba, nblocks, bytes));
+ /* clear the two low bits of both PRP entries */
+ cmd->prp1 &= ~(0x03UL);
+ cmd->prp2 &= ~(0x03UL);
+
+ DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
+ /* from here on lba is a byte offset and size counts the bytes still to transfer */
+ size = bytes;
+ lba *= sc->nvstore.sectsz;
+ /* cpsz: bytes from prp1 to the end of its page, capped at the transfer size */
+ cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
+
+ if (cpsz > bytes)
+ cpsz = bytes;
+
+ if (req != NULL) {
+ req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
+ cmd->cdw10;
+ req->opc = cmd->opc;
+ req->cid = cmd->cid;
+ req->nsid = cmd->nsid;
+ }
+
+ err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
+ cmd->opc == NVME_OPC_WRITE, lba);
+ lba += cpsz;
+ size -= cpsz;
+ /* the whole transfer fit in the prp1 range */
+ if (size == 0)
+ goto iodone;
+
+ if (size <= PAGE_SIZE) {
+ /* prp2 is second (and final) page in transfer */
+
+ err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
+ size,
+ cmd->opc == NVME_OPC_WRITE,
+ lba);
+ } else {
+ uint64_t *prp_list;
+ int i;
+
+ /* prp2 is pointer to a physical region page list */
+ prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
+ cmd->prp2, PAGE_SIZE);
+
+ i = 0;
+ while (size != 0) {
+ cpsz = MIN(size, PAGE_SIZE);
+
+ /*
+ * Move to linked physical region page list
+ * in last item.
+ */
+ if (i == (NVME_PRP2_ITEMS-1) &&
+ size > PAGE_SIZE) {
+ assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
+ prp_list = paddr_guest2host(
+ sc->nsc_pi->pi_vmctx,
+ prp_list[i], PAGE_SIZE);
+ i = 0;
+ }
+ if (prp_list[i] == 0) {
+ WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
+ err = 1;
+ break;
+ }
+
+ err = pci_nvme_append_iov_req(sc, req,
+ prp_list[i], cpsz,
+ cmd->opc == NVME_OPC_WRITE, lba);
+ if (err)
+ break;
+
+ lba += cpsz;
+ size -= cpsz;
+ i++;
+ }
+ }
+
+iodone:
+ if (sc->nvstore.type == NVME_STOR_RAM) {
+ uint16_t code, status = 0;
+
+ code = err ? NVME_SC_LBA_OUT_OF_RANGE :
+ NVME_SC_SUCCESS;
+ pci_nvme_status_genc(&status, code);
+
+ pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+ status, 1);
+
+ continue;
+ }
+
+ /* blockif path: submit the assembled request; pci_nvme_io_done posts the completion */
+ if (err)
+ goto do_error;
+
+ req->io_req.br_callback = pci_nvme_io_done;
+
+ err = 0;
+ switch (cmd->opc) {
+ case NVME_OPC_READ:
+ err = blockif_read(sc->nvstore.ctx, &req->io_req);
+ break;
+ case NVME_OPC_WRITE:
+ err = blockif_write(sc->nvstore.ctx, &req->io_req);
+ break;
+ default:
+ WPRINTF(("%s unhandled io command 0x%x\r\n",
+ __func__, cmd->opc));
+ err = 1;
+ }
+ /* a failed request is completed here with DATA_TRANSFER_ERROR and its slot released */
+do_error:
+ if (err) {
+ uint16_t status = 0;
+
+ pci_nvme_status_genc(&status,
+ NVME_SC_DATA_TRANSFER_ERROR);
+
+ pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+ status, 1);
+ pci_nvme_release_ioreq(sc, req);
+ }
+ }
+ /* publish the new head, then clear the busy flag */
+ atomic_store_short(&sq->head, sqhead);
+ atomic_store_int(&sq->busy, 0);
+}
+
+/*
+ * Handle a guest write to a queue doorbell.
+ *
+ * For a submission queue doorbell (is_sq != 0) the low 16 bits of value
+ * become the new tail of queue idx and the queue is processed: queue 0 by
+ * the admin handler, any other by the I/O handler.  For a completion queue
+ * doorbell the low 16 bits of value become the new head of queue idx.
+ * Queue indexes above the configured queue counts are rejected with a
+ * warning and nothing is modified.
+ */
+static void
+pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
+	uint64_t idx, int is_sq, uint64_t value)
+{
+	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
+	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
+
+	if (is_sq) {
+		/*
+		 * Validate the index before touching the queue so a bogus
+		 * doorbell cannot update the tail of an unconfigured queue.
+		 * Index 0 (the admin queue) always passes this check.
+		 */
+		if (idx > sc->num_squeues) {
+			WPRINTF(("%s SQ index %lu overflow from "
+			         "guest (max %u)\r\n",
+			         __func__, idx, sc->num_squeues));
+			return;
+		}
+
+		atomic_store_short(&sc->submit_queues[idx].tail,
+		                   (uint16_t)value);
+
+		if (idx == 0) {
+			pci_nvme_handle_admin_cmd(sc, value);
+		} else {
+			/* submission queue; handle new entries in SQ */
+			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
+		}
+	} else {
+		if (idx > sc->num_cqueues) {
+			WPRINTF(("%s queue index %lu overflow from "
+			         "guest (max %u)\r\n",
+			         __func__, idx, sc->num_cqueues));
+			return;
+		}
+
+		sc->compl_queues[idx].head = (uint16_t)value;
+	}
+}
+
+/*
+ * Debug aid: log the name of the BAR 0 controller register at offset.
+ * func is the caller's name and iswrite selects the "WRITE" or "READ" tag.
+ * Offsets that match no known register are logged as unknown.
+ */
+static void
+pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
+{
+	const char *dir;
+
+	if (iswrite)
+		dir = "WRITE";
+	else
+		dir = "READ";
+
+	switch (offset) {
+	case NVME_CR_CAP_LOW:
+		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, dir));
+		break;
+	case NVME_CR_CAP_HI:
+		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, dir));
+		break;
+	case NVME_CR_VS:
+		DPRINTF(("%s %s NVME_CR_VS\r\n", func, dir));
+		break;
+	case NVME_CR_INTMS:
+		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, dir));
+		break;
+	case NVME_CR_INTMC:
+		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, dir));
+		break;
+	case NVME_CR_CC:
+		DPRINTF(("%s %s NVME_CR_CC\r\n", func, dir));
+		break;
+	case NVME_CR_CSTS:
+		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, dir));
+		break;
+	case NVME_CR_NSSR:
+		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, dir));
+		break;
+	case NVME_CR_AQA:
+		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, dir));
+		break;
+	case NVME_CR_ASQ_LOW:
+		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, dir));
+		break;
+	case NVME_CR_ASQ_HI:
+		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, dir));
+		break;
+	case NVME_CR_ACQ_LOW:
+		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, dir));
+		break;
+	case NVME_CR_ACQ_HI:
+		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, dir));
+		break;
+	default:
+		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
+		break;
+	}
+}
+/* Handle a guest write to BAR 0: doorbells at or above NVME_DOORBELL_OFFSET, controller registers below it. */
+static void
+pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
+ uint64_t offset, int size, uint64_t value)
+{
+ uint32_t ccreg;
+
+ if (offset >= NVME_DOORBELL_OFFSET) {
+ uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
+ uint64_t idx = belloffset / 8; /* door bell size = 2*int */
+ int is_sq = (belloffset % 8) < 4;
+ /* reject offsets beyond the CQ doorbell of queue max_queues */
+ if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
+ WPRINTF(("guest attempted an overflow write offset "
+ "0x%lx, val 0x%lx in %s",
+ offset, value, __func__));
+ return;
+ }
+
+ pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
+ return;
+ }
+
+ DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
+ offset, size, value));
+ /* controller register writes are only accepted when exactly 4 bytes wide */
+ if (size != 4) {
+ WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
+ "val 0x%lx) to bar0 in %s",
+ size, offset, value, __func__));
+ /* TODO: shutdown device */
+ return;
+ }
+
+ pci_nvme_bar0_reg_dumps(__func__, offset, 1);
+ /* the register updates below are made with sc->mtx held */
+ pthread_mutex_lock(&sc->mtx);
+
+ switch (offset) {
+ case NVME_CR_CAP_LOW:
+ case NVME_CR_CAP_HI:
+ /* readonly */
+ break;
+ case NVME_CR_VS:
+ /* readonly */
+ break;
+ case NVME_CR_INTMS:
+ /* MSI-X, so ignore */
+ break;
+ case NVME_CR_INTMC:
+ /* MSI-X, so ignore */
+ break;
+ case NVME_CR_CC:
+ ccreg = (uint32_t)value;
+
+ DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
+ "iocqes %u\r\n",
+ __func__,
+ NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
+ NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
+ NVME_CC_GET_IOCQES(ccreg)));
+ /* any non-zero SHN is reported as NVME_SHST_COMPLETE straight away */
+ if (NVME_CC_GET_SHN(ccreg)) {
+ /* perform shutdown - flush out data to backend */
+ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
+ NVME_CSTS_REG_SHST_SHIFT);
+ sc->regs.csts |= NVME_SHST_COMPLETE <<
+ NVME_CSTS_REG_SHST_SHIFT;
+ }
+ if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
+ if (NVME_CC_GET_EN(ccreg) == 0)
+ /* transition 1->0 causes controller reset */
+ pci_nvme_reset_locked(sc);
+ else
+ pci_nvme_init_controller(ctx, sc);
+ }
+
+ /* Insert the iocqes, iosqes and en bits from the write */
+ sc->regs.cc &= ~NVME_CC_WRITE_MASK;
+ sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
+ if (NVME_CC_GET_EN(ccreg) == 0) {
+ /* Insert the ams, mps and css bit fields */
+ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
+ sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
+ sc->regs.csts &= ~NVME_CSTS_RDY;
+ } else if (sc->pending_ios == 0) {
+ sc->regs.csts |= NVME_CSTS_RDY;
+ }
+ break;
+ case NVME_CR_CSTS:
+ break;
+ case NVME_CR_NSSR:
+ /* ignore writes; don't support subsystem reset */
+ break;
+ case NVME_CR_AQA:
+ sc->regs.aqa = (uint32_t)value;
+ break;
+ case NVME_CR_ASQ_LOW:
+ sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
+ (0xFFFFF000 & value);
+ break;
+ case NVME_CR_ASQ_HI:
+ sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
+ (value << 32);
+ break;
+ case NVME_CR_ACQ_LOW:
+ sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
+ (0xFFFFF000 & value);
+ break;
+ case NVME_CR_ACQ_HI:
+ sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
+ (value << 32);
+ break;
+ default:
+ DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
+ __func__, offset, value, size));
+ }
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * BAR write entry point for the NVMe device model.
+ *
+ * Writes to the BAR holding the MSI-X table or PBA are forwarded to the
+ * generic MSI-X emulation, writes to BAR 0 go to the register/doorbell
+ * handler, and anything else is only logged.
+ */
+static void
+pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+                int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_nvme_softc* sc = pi->pi_arg;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
+		         " value 0x%lx\r\n", baridx, offset, size, value));
+
+		pci_emul_msix_twrite(pi, offset, size, value);
+		return;
+	}
+
+	if (baridx == 0) {
+		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
+	} else {
+		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
+		         __func__, baridx, value));
+	}
+}
+
+/*
+ * Handle a guest read of size bytes from BAR 0 at offset.
+ *
+ * Offsets below NVME_DOORBELL_OFFSET are served from the register block
+ * sc->regs under sc->mtx.  Reads at or above the doorbell offset return 0
+ * and log a warning.  The result is masked to the access width.
+ */
+static uint64_t
+pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
+{
+	/*
+	 * Start from zero: memcpy() below fills only the low size bytes, and
+	 * the remaining bytes must not be left indeterminate.
+	 */
+	uint64_t value = 0;
+
+	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
+
+	if (offset < NVME_DOORBELL_OFFSET) {
+		void *p = &(sc->regs);
+		pthread_mutex_lock(&sc->mtx);
+		memcpy(&value, (void *)((uintptr_t)p + offset), size);
+		pthread_mutex_unlock(&sc->mtx);
+	} else {
+		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
+	}
+
+	switch (size) {
+	case 1:
+		value &= 0xFF;
+		break;
+	case 2:
+		value &= 0xFFFF;
+		break;
+	case 4:
+		value &= 0xFFFFFFFF;
+		break;
+	}
+
+	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
+	         offset, size, (uint32_t)value));
+
+	return (value);
+}
+
+
+
+/*
+ * BAR read entry point for the NVMe device model.
+ *
+ * Reads of the BAR holding the MSI-X table or PBA are forwarded to the
+ * generic MSI-X emulation, reads of BAR 0 go to the register handler, and
+ * reads of any other BAR are logged and return 0.
+ */
+static uint64_t
+pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct pci_nvme_softc* sc = pi->pi_arg;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
+		        baridx, offset, size));
+
+		return pci_emul_msix_tread(pi, offset, size);
+	}
+
+	if (baridx == 0)
+		return pci_nvme_read_bar_0(sc, offset, size);
+
+	DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
+
+	return (0);
+}
+
+
+/*
+ * Parse the comma-separated device option string into the softc.
+ *
+ * Recognised key=value options are maxq, qsz, ioslots, sectsz, ser and ram
+ * (size in MB of a RAM-backed store).  If the first option is none of
+ * these it is taken as the path of a blockif backing store.  Defaults are
+ * installed before parsing and the results are validated afterwards.
+ *
+ * Returns 0 on success, or -1 after printing a message if an option is
+ * malformed, lacks its value, or no usable backing store was configured.
+ */
+static int
+pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
+{
+	char bident[sizeof("XX:X:X")];
+	char	*uopt, *xopts, *config;
+	uint32_t sectsz;
+	int optidx;
+	int needs_value;
+
+	sc->max_queues = NVME_QUEUES;
+	sc->max_qentries = NVME_MAX_QENTRIES;
+	sc->ioslots = NVME_IOSLOTS;
+	sc->num_squeues = sc->max_queues;
+	sc->num_cqueues = sc->max_queues;
+	sectsz = 0;
+
+	/* with no options at all there cannot be a backing store */
+	if (opts == NULL) {
+		fprintf(stderr, "backing store not specified\n");
+		return (-1);
+	}
+
+	uopt = strdup(opts);
+	if (uopt == NULL) {
+		perror("Unable to copy option string");
+		return (-1);
+	}
+	optidx = 0;
+	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
+	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
+	for (xopts = strtok(uopt, ",");
+	     xopts != NULL;
+	     xopts = strtok(NULL, ",")) {
+
+		/* split "key=value"; config stays NULL when there is no '=' */
+		if ((config = strchr(xopts, '=')) != NULL)
+			*config++ = '\0';
+
+		/*
+		 * Every keyword option dereferences its value, so a bare
+		 * keyword must be rejected before config is used.
+		 */
+		needs_value = !strcmp("maxq", xopts) ||
+		    !strcmp("qsz", xopts) || !strcmp("ioslots", xopts) ||
+		    !strcmp("sectsz", xopts) || !strcmp("ser", xopts) ||
+		    !strcmp("ram", xopts);
+		if (needs_value && config == NULL) {
+			fprintf(stderr, "Option %s requires a value\n", xopts);
+			free(uopt);
+			return (-1);
+		}
+
+		if (!strcmp("maxq", xopts)) {
+			sc->max_queues = atoi(config);
+		} else if (!strcmp("qsz", xopts)) {
+			sc->max_qentries = atoi(config);
+		} else if (!strcmp("ioslots", xopts)) {
+			sc->ioslots = atoi(config);
+		} else if (!strcmp("sectsz", xopts)) {
+			sectsz = atoi(config);
+		} else if (!strcmp("ser", xopts)) {
+			/*
+			 * This field indicates the Product Serial Number in
+			 * 7-bit ASCII, unused bytes should be space characters.
+			 * Ref: NVMe v1.3c.
+			 */
+			cpywithpad((char *)sc->ctrldata.sn,
+			           sizeof(sc->ctrldata.sn), config, ' ');
+		} else if (!strcmp("ram", xopts)) {
+			/* config points just past "ram=" */
+			uint64_t sz = strtoull(config, NULL, 10);
+
+			sc->nvstore.type = NVME_STOR_RAM;
+			sc->nvstore.size = sz * 1024 * 1024;
+			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
+			sc->nvstore.sectsz = 4096;
+			sc->nvstore.sectsz_bits = 12;
+			if (sc->nvstore.ctx == NULL) {
+				perror("Unable to allocate RAM");
+				free(uopt);
+				return (-1);
+			}
+		} else if (optidx == 0) {
+			snprintf(bident, sizeof(bident), "%d:%d",
+			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
+			sc->nvstore.ctx = blockif_open(xopts, bident);
+			if (sc->nvstore.ctx == NULL) {
+				perror("Could not open backing file");
+				free(uopt);
+				return (-1);
+			}
+			sc->nvstore.type = NVME_STOR_BLOCKIF;
+			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
+		} else {
+			fprintf(stderr, "Invalid option %s\n", xopts);
+			free(uopt);
+			return (-1);
+		}
+
+		optidx++;
+	}
+	free(uopt);
+
+	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
+		fprintf(stderr, "backing store not specified\n");
+		return (-1);
+	}
+	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
+		sc->nvstore.sectsz = sectsz;
+	else if (sc->nvstore.type != NVME_STOR_RAM)
+		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
+	/* sectsz_bits: smallest exponent >= 9 with 2^bits >= sector size */
+	for (sc->nvstore.sectsz_bits = 9;
+	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
+	     sc->nvstore.sectsz_bits++);
+
+	/* an out-of-range maxq silently falls back to the default */
+	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
+		sc->max_queues = NVME_QUEUES;
+
+	if (sc->max_qentries <= 0) {
+		fprintf(stderr, "Invalid qsz option\n");
+		return (-1);
+	}
+	if (sc->ioslots <= 0) {
+		fprintf(stderr, "Invalid ioslots option\n");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Device model init hook: create and configure one emulated NVMe controller.
+ *
+ * Allocates the softc, parses the option string, builds the free list of
+ * I/O request slots, fills in PCI config space, allocates BAR 0 (registers
+ * plus one doorbell pair per queue) and the MSI-X capability, then resets
+ * the controller and initialises its identify data.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int
+pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_nvme_softc *sc;
+	uint32_t pci_membar_sz;
+	int	error;
+
+	error = 0;
+
+	sc = calloc(1, sizeof(struct pci_nvme_softc));
+	if (sc == NULL) {
+		WPRINTF(("%s unable to allocate softc\r\n", __func__));
+		return (-1);
+	}
+	pi->pi_arg = sc;
+	sc->nsc_pi = pi;
+
+	error = pci_nvme_parse_opts(sc, opts);
+	if (error < 0)
+		goto done;
+	else
+		error = 0;
+
+	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
+	if (sc->ioreqs == NULL) {
+		WPRINTF(("%s unable to allocate I/O slots\r\n", __func__));
+		error = -1;
+		goto done;
+	}
+	/* chain the slots into a singly linked free list */
+	for (int i = 0; i < sc->ioslots; i++) {
+		if (i < (sc->ioslots-1))
+			sc->ioreqs[i].next = &sc->ioreqs[i+1];
+		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
+		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
+	}
+	sc->ioreqs_free = sc->ioreqs;
+	sc->intr_coales_aggr_thresh = 1;
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
+	pci_set_cfgdata8(pi, PCIR_PROGIF,
+	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
+
+	/* allocate size of nvme registers + doorbell space for all queues */
+	pci_membar_sz = sizeof(struct nvme_registers) +
+	                2*sizeof(uint32_t)*(sc->max_queues + 1);
+
+	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
+
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
+	if (error) {
+		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
+		goto done;
+	}
+
+	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
+	if (error) {
+		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
+		goto done;
+	}
+
+	pthread_mutex_init(&sc->mtx, NULL);
+	/* the semaphore counts free I/O request slots */
+	sem_init(&sc->iosemlock, 0, sc->ioslots);
+
+	pci_nvme_reset(sc);
+	pci_nvme_init_ctrldata(sc);
+	pci_nvme_init_nsdata(sc);
+
+	pci_lintr_request(pi);
+
+done:
+	return (error);
+}
+
+/* Device model descriptor for the "nvme" emulation: init, BAR write and BAR read hooks. */
+struct pci_devemu pci_de_nvme = {
+ .pe_emu = "nvme",
+ .pe_init = pci_nvme_init,
+ .pe_barwrite = pci_nvme_write,
+ .pe_barread = pci_nvme_read
+};
+PCI_EMUL_SET(pci_de_nvme);
diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c
new file mode 100644
index 0000000000..3782914cd5
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_passthru.c
@@ -0,0 +1,910 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <sys/pci.h>
+
+#include <dev/io/iodev.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/iodev.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include <sys/ppt_dev.h>
+#include "pci_emul.h"
+#include "mem.h"
+
+#define LEGACY_SUPPORT 1
+
+#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
+#define MSIX_CAPLEN 12
+
+/*
+ * Per-device state for one passed-through PCI function.
+ */
+struct passthru_softc {
+ struct pci_devinst *psc_pi; /* emulated device instance backed by this softc */
+ struct pcibar psc_bar[PCI_BARMAX + 1]; /* cached info about the "real" BARs */
+ struct {
+ int capoff; /* config offset of the MSI capability, 0 if none */
+ int msgctrl; /* cached MSI message control word */
+ int emulated; /* 1 when the capability was crafted by this file */
+ } psc_msi;
+ struct {
+ int capoff; /* config offset of the MSI-X capability, 0 if none */
+ } psc_msix;
+ int pptfd; /* open descriptor of the ppt device node */
+ int msi_limit; /* MSI vector limit from vm_get_pptdev_limits(); -1 = none */
+ int msix_limit; /* MSI-X vector limit, same -1 convention */
+};
+
+/*
+ * Return the number of bytes of the MSI capability that this file emulates,
+ * derived from the capability's message control word.
+ *
+ * The per-vector 'mask' and 'pending' registers (present when
+ * PCIM_MSICTRL_VECTOR is set) are deliberately not counted: the guest is
+ * left to manipulate them directly.
+ */
+static int
+msi_caplen(int msgctrl)
+{
+	/* 10 bytes is the minimum length of the msi capability. */
+	const int base_len = 10;
+	/* 64-bit capable functions carry one extra dword. */
+	const int addr64_len = (msgctrl & PCIM_MSICTRL_64BIT) ? 4 : 0;
+
+	return (base_len + addr64_len);
+}
+
+/*
+ * Read 'width' bytes at offset 'reg' of the physical device's config space
+ * through the PPT_CFG_READ ioctl on the ppt descriptor.
+ *
+ * Returns the value read, or 0 when the ioctl fails.
+ */
+static uint32_t
+read_config(const struct passthru_softc *sc, long reg, int width)
+{
+	struct ppt_cfg_io req;
+
+	req.pci_width = width;
+	req.pci_off = reg;
+
+	if (ioctl(sc->pptfd, PPT_CFG_READ, &req) == 0)
+		return (req.pci_data);
+
+	return (0);
+}
+
+/*
+ * Write the low 'width' bytes of 'data' at offset 'reg' of the physical
+ * device's config space through the PPT_CFG_WRITE ioctl.  This is
+ * best-effort: an ioctl failure is ignored.
+ */
+static void
+write_config(const struct passthru_softc *sc, long reg, int width,
+    uint32_t data)
+{
+	struct ppt_cfg_io req;
+
+	req.pci_data = data;
+	req.pci_width = width;
+	req.pci_off = reg;
+
+	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &req);
+}
+
+/*
+ * Query the ppt driver for the type, base and size of BAR number 'bar' of
+ * the physical device.
+ *
+ * On success returns 0 and fills in *type, *base and *size.  Returns -1 when
+ * the PPT_BAR_QUERY ioctl fails (callers treat that as "BAR not present").
+ * An unknown BAR type reported by the driver is fatal.
+ */
+static int
+passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
+    uint64_t *base, uint64_t *size)
+{
+	struct ppt_bar_query pb;
+
+	pb.pbq_baridx = bar;
+
+	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
+		return (-1);
+	}
+
+	switch (pb.pbq_type) {
+	case PCI_ADDR_IO:
+		*type = PCIBAR_IO;
+		break;
+	case PCI_ADDR_MEM32:
+		*type = PCIBAR_MEM32;
+		break;
+	case PCI_ADDR_MEM64:
+		*type = PCIBAR_MEM64;
+		break;
+	default:
+		/*
+		 * Use errx(), not err(): the ioctl succeeded, so errno holds
+		 * nothing relevant and must not be appended to the message.
+		 * errx() also supplies the trailing newline itself.
+		 */
+		errx(1, "unrecognized BAR type: %u", pb.pbq_type);
+		/* NOTREACHED */
+		break;
+	}
+
+	*base = pb.pbq_base;
+	*size = pb.pbq_size;
+	return (0);
+}
+
+/*
+ * Open the ppt device node at 'path' read/write.
+ *
+ * Returns 0 and stores the descriptor in *pptfdp on success; on failure
+ * returns the errno from open(2) and leaves *pptfdp untouched.
+ */
+static int
+passthru_dev_open(const char *path, int *pptfdp)
+{
+	const int fd = open(path, O_RDWR);
+
+	if (fd < 0)
+		return (errno);
+
+	/* XXX: verify fd with ioctl? */
+	*pptfdp = fd;
+	return (0);
+}
+
+#ifdef LEGACY_SUPPORT
+/*
+ * Craft an MSI capability advertising 'msgnum' messages and chained to
+ * 'nextptr', and place it in the emulated config space of 'pi'.
+ *
+ * Returns the config-space offset at which the capability was written.
+ */
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+	struct msicap cap;
+	const u_char *bytes = (const u_char *)&cap;
+	int off;
+	size_t n;
+
+	pci_populate_msicap(&cap, msgnum, nextptr);
+
+	/*
+	 * XXX
+	 * The capability is copied to the very end of the 256-byte config
+	 * space (rounded to a dword boundary).  This is wrong because it
+	 * could shadow something useful to the device.
+	 */
+	off = 256 - roundup(sizeof(cap), 4);
+	for (n = 0; n < sizeof(cap); n++)
+		pci_set_cfgdata8(pi, off + n, bytes[n]);
+
+	return (off);
+}
+#endif /* LEGACY_SUPPORT */
+
+/*
+ * Clamp the MSI and MSI-X vector counts advertised to the guest so they do
+ * not exceed the limits stored in sc->msi_limit / sc->msix_limit (a limit
+ * of -1 means "no limit").  Whenever a count is lowered, the adjusted
+ * message control word is written back to the emulated config space.
+ *
+ * 'msixcap' is only read or written when the device has an MSI-X capability.
+ */
+static void
+passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
+{
+	struct pci_devinst *pi = sc->psc_pi;
+	const int msi_off = sc->psc_msi.capoff;
+	const int msix_off = sc->psc_msix.capoff;
+
+	/* Reduce the number of MSI vectors if higher than OS limit */
+	if (msi_off != 0 && sc->msi_limit != -1) {
+		int max_mmc, cur_mmc;
+
+		/* Translate the vector limit into its MMC field encoding. */
+		if (sc->msi_limit > 16)
+			max_mmc = PCIM_MSICTRL_MMC_32;
+		else if (sc->msi_limit > 8)
+			max_mmc = PCIM_MSICTRL_MMC_16;
+		else if (sc->msi_limit > 4)
+			max_mmc = PCIM_MSICTRL_MMC_8;
+		else if (sc->msi_limit > 2)
+			max_mmc = PCIM_MSICTRL_MMC_4;
+		else if (sc->msi_limit > 1)
+			max_mmc = PCIM_MSICTRL_MMC_2;
+		else
+			max_mmc = PCIM_MSICTRL_MMC_1;
+
+		cur_mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;
+		if (cur_mmc > max_mmc) {
+			sc->psc_msi.msgctrl =
+			    (sc->psc_msi.msgctrl & ~PCIM_MSICTRL_MMC_MASK) |
+			    max_mmc;
+			pci_set_cfgdata16(pi, msi_off + 2,
+			    sc->psc_msi.msgctrl);
+		}
+	}
+
+	/* Reduce the number of MSI-X vectors if higher than OS limit */
+	if (msix_off != 0 && sc->msix_limit != -1 &&
+	    MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
+		/* The table-size field holds the entry count minus one. */
+		msixcap->msgctrl =
+		    (msixcap->msgctrl & ~PCIM_MSIXCTRL_TABLE_SIZE) |
+		    (sc->msix_limit - 1);
+		pci_set_cfgdata16(pi, msix_off + 2, msixcap->msgctrl);
+	}
+}
+
+/*
+ * Discover the interrupt capabilities of the physical device and mirror
+ * them into the emulated config space.
+ *
+ * Walks the device's capability list, caching the offsets of the MSI and
+ * MSI-X capabilities and copying their contents into the emulated config
+ * space.  Vector counts are then clamped by passthru_intr_limit().  If
+ * MSI-X is present, the emulated MSI-X table is allocated with every entry
+ * masked.  With LEGACY_SUPPORT, a device that has a capability list but no
+ * MSI capability gets a crafted single-message MSI capability linked at the
+ * head of the list.
+ *
+ * Returns 0 on success; -1 if neither MSI nor MSI-X is available or the
+ * emulated MSI-X table cannot be allocated.
+ */
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+	int i, ptr, capptr, cap, sts, caplen, table_size;
+	uint32_t u32;
+	struct pci_devinst *pi = sc->psc_pi;
+	struct msixcap msixcap;
+	uint32_t *msixcap_ptr;
+
+	/*
+	 * Keep msixcap well defined even when the device turns out to have
+	 * no MSI-X capability; it is handed to passthru_intr_limit() below.
+	 */
+	memset(&msixcap, 0, sizeof (msixcap));
+
+	/*
+	 * Parse the capabilities and cache the location of the MSI
+	 * and MSI-X capabilities.
+	 */
+	sts = read_config(sc, PCIR_STATUS, 2);
+	if (sts & PCIM_STATUS_CAPPRESENT) {
+		ptr = read_config(sc, PCIR_CAP_PTR, 1);
+		while (ptr != 0 && ptr != 0xff) {
+			cap = read_config(sc, ptr + PCICAP_ID, 1);
+			if (cap == PCIY_MSI) {
+				/*
+				 * Copy the MSI capability into the config
+				 * space of the emulated pci device
+				 */
+				sc->psc_msi.capoff = ptr;
+				sc->psc_msi.msgctrl = read_config(sc,
+				    ptr + 2, 2);
+				sc->psc_msi.emulated = 0;
+				caplen = msi_caplen(sc->psc_msi.msgctrl);
+				capptr = ptr;
+				/* Copied a dword at a time. */
+				while (caplen > 0) {
+					u32 = read_config(sc, capptr, 4);
+					pci_set_cfgdata32(pi, capptr, u32);
+					caplen -= 4;
+					capptr += 4;
+				}
+			} else if (cap == PCIY_MSIX) {
+				/*
+				 * Copy the MSI-X capability, both into the
+				 * emulated config space and into the local
+				 * msixcap for decoding below.
+				 */
+				sc->psc_msix.capoff = ptr;
+				caplen = MSIX_CAPLEN;
+				msixcap_ptr = (uint32_t *)&msixcap;
+				capptr = ptr;
+				while (caplen > 0) {
+					u32 = read_config(sc, capptr, 4);
+					*msixcap_ptr = u32;
+					pci_set_cfgdata32(pi, capptr, u32);
+					caplen -= 4;
+					capptr += 4;
+					msixcap_ptr++;
+				}
+			}
+			ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1);
+		}
+	}
+
+	passthru_intr_limit(sc, &msixcap);
+
+	if (sc->psc_msix.capoff != 0) {
+		pi->pi_msix.pba_bar =
+		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
+		pi->pi_msix.pba_offset =
+		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
+		pi->pi_msix.table_bar =
+		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
+		pi->pi_msix.table_offset =
+		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
+		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
+		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
+
+		/* Allocate the emulated MSI-X table array */
+		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+		pi->pi_msix.table = calloc(1, table_size);
+		if (pi->pi_msix.table == NULL) {
+			/* Fail cleanly instead of dereferencing NULL below. */
+			warn("failed to allocate emulated MSI-X table");
+			return (-1);
+		}
+
+		/* Mask all table entries */
+		for (i = 0; i < pi->pi_msix.table_count; i++) {
+			pi->pi_msix.table[i].vector_control |=
+			    PCIM_MSIX_VCTRL_MASK;
+		}
+	}
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * If the passthrough device does not support MSI then craft a
+	 * MSI capability for it. We link the new MSI capability at the
+	 * head of the list of capabilities.
+	 */
+	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+		int origptr, msiptr;
+		origptr = read_config(sc, PCIR_CAP_PTR, 1);
+		msiptr = passthru_add_msicap(pi, 1, origptr);
+		sc->psc_msi.capoff = msiptr;
+		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+		sc->psc_msi.emulated = 1;
+		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+	}
+#endif
+
+	/* Make sure one of the capabilities is present */
+	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) {
+		return (-1);
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * Handle a guest read of 'size' bytes at 'offset' within the BAR that holds
+ * the MSI-X table.
+ *
+ * Offsets inside the PBA range are served from the mapped PBA page; other
+ * offsets are served from the emulated MSI-X table.  Returns the value
+ * read, or all-ones ((uint64_t)-1) when the offset falls outside both
+ * regions or 'size' is not 1, 2, 4 or 8.
+ */
+static uint64_t
+passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
+{
+	struct pci_devinst *pi = sc->psc_pi;
+	void *src;
+
+	if (offset >= pi->pi_msix.pba_offset &&
+	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
+		/* Pending-bit array: read through the mapped PBA page. */
+		src = (void *)(pi->pi_msix.pba_page + offset -
+		    pi->pi_msix.pba_page_offset);
+	} else {
+		size_t entry_offset;
+		int index;
+
+		if (offset < pi->pi_msix.table_offset)
+			return (-1);
+
+		/* Make the offset relative to the start of the table. */
+		offset -= pi->pi_msix.table_offset;
+		index = offset / MSIX_TABLE_ENTRY_SIZE;
+		if (index >= pi->pi_msix.table_count)
+			return (-1);
+
+		entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+		src = (uint8_t *)&pi->pi_msix.table[index] + entry_offset;
+	}
+
+	/* Access with exactly the width the guest asked for. */
+	switch (size) {
+	case 1:
+		return (*(uint8_t *)src);
+	case 2:
+		return (*(uint16_t *)src);
+	case 4:
+		return (*(uint32_t *)src);
+	case 8:
+		return (*(uint64_t *)src);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Handle a guest write of 'size' bytes of 'data' at 'offset' within the BAR
+ * that holds the MSI-X table.
+ *
+ * Writes that land in the PBA range are stored through the mapped PBA page.
+ * Writes below the table or past its last entry are silently dropped.
+ * Writes to a table entry update the emulated table and, when MSI-X is
+ * enabled, may reprogram the vector via vm_setup_pptdev_msix().
+ */
+static void
+passthru_msix_table_write(struct vmctx *ctx, int vcpu,
+ struct passthru_softc *sc, uint64_t offset, int size, uint64_t data)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint8_t *dest8;
+ uint16_t *dest16;
+ uint32_t *dest32;
+ uint64_t *dest64;
+ size_t entry_offset;
+ uint32_t vector_control;
+ int index;
+
+ pi = sc->psc_pi;
+ /* PBA range: store with the width requested; other sizes are ignored. */
+ if (offset >= pi->pi_msix.pba_offset &&
+ offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
+ switch(size) {
+ case 1:
+ dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
+ pi->pi_msix.pba_page_offset);
+ *dest8 = data;
+ break;
+ case 2:
+ dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
+ pi->pi_msix.pba_page_offset);
+ *dest16 = data;
+ break;
+ case 4:
+ dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
+ pi->pi_msix.pba_page_offset);
+ *dest32 = data;
+ break;
+ case 8:
+ dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
+ pi->pi_msix.pba_page_offset);
+ *dest64 = data;
+ break;
+ default:
+ break;
+ }
+ return;
+ }
+
+ if (offset < pi->pi_msix.table_offset)
+ return;
+
+ /* From here on, offset is relative to the start of the table. */
+ offset -= pi->pi_msix.table_offset;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ if (index >= pi->pi_msix.table_count)
+ return;
+
+ entry = &pi->pi_msix.table[index];
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+ /* Only 4 byte naturally-aligned writes are supported */
+ /* NOTE(review): a guest write of another width trips these asserts. */
+ assert(size == 4);
+ assert(entry_offset % 4 == 0);
+
+ /* Snapshot the old vector control before the store may overwrite it. */
+ vector_control = entry->vector_control;
+ dest32 = (uint32_t *)((void *)entry + entry_offset);
+ *dest32 = data;
+ /* If MSI-X hasn't been enabled, do nothing */
+ if (pi->pi_msix.enabled) {
+ /* If the entry is masked, don't set it up */
+ /*
+ * i.e. reprogram when the entry is unmasked after this write or
+ * was unmasked before it; only masked-before-and-after is skipped.
+ */
+ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
+ (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ (void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd,
+ index, entry->addr, entry->msg_data,
+ entry->vector_control);
+ }
+ }
+}
+
+/*
+ * Set up the guest mappings for the BAR that contains the MSI-X table.
+ *
+ * The page-aligned region holding the table is left unmapped; everything
+ * before and after it in the BAR is mapped with vm_map_pptdev_mmio().  If
+ * the PBA lives in the same BAR and shares a page with the table region,
+ * that page is mmap()ed from the ppt descriptor so PBA accesses can be
+ * served from it.
+ *
+ * 'base' is passed straight through as the last argument of
+ * vm_map_pptdev_mmio(); the caller supplies the real BAR base reported by
+ * the ppt driver.
+ *
+ * Returns 0 on success, -1 if the PBA page cannot be mapped, or the error
+ * from vm_map_pptdev_mmio().
+ */
+static int
+init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
+{
+ int error, idx;
+ size_t len, remaining;
+ uint32_t table_size, table_offset;
+ uint32_t pba_size, pba_offset;
+ vm_paddr_t start;
+ struct pci_devinst *pi = sc->psc_pi;
+
+ assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
+
+ /*
+ * If the MSI-X table BAR maps memory intended for
+ * other uses, it is at least assured that the table
+ * either resides in its own page within the region,
+ * or it resides in a page shared with only the PBA.
+ */
+ /* Page-aligned start of the region that contains the table. */
+ table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
+
+ /* Size of that region, rounded up to whole pages. */
+ table_size = pi->pi_msix.table_offset - table_offset;
+ table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+ table_size = roundup2(table_size, 4096);
+
+ /* start/remaining track the emulated BAR's address and unmapped length. */
+ idx = pi->pi_msix.table_bar;
+ start = pi->pi_bar[idx].addr;
+ remaining = pi->pi_bar[idx].size;
+
+ if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
+ pba_offset = pi->pi_msix.pba_offset;
+ pba_size = pi->pi_msix.pba_size;
+ if (pba_offset >= table_offset + table_size ||
+ table_offset >= pba_offset + pba_size) {
+ /*
+ * If the PBA does not share a page with the MSI-x
+ * tables, no PBA emulation is required.
+ */
+ pi->pi_msix.pba_page = NULL;
+ pi->pi_msix.pba_page_offset = 0;
+ } else {
+ /*
+ * The PBA overlaps with either the first or last
+ * page of the MSI-X table region. Map the
+ * appropriate page.
+ */
+ if (pba_offset <= table_offset)
+ pi->pi_msix.pba_page_offset = table_offset;
+ else
+ pi->pi_msix.pba_page_offset = table_offset +
+ table_size - 4096;
+ pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
+ PROT_WRITE, MAP_SHARED, sc->pptfd,
+ pi->pi_msix.pba_page_offset);
+ if (pi->pi_msix.pba_page == MAP_FAILED) {
+ warn("Failed to map PBA page for MSI-X on %d",
+ sc->pptfd);
+ return (-1);
+ }
+ }
+ }
+
+ /* Map everything before the MSI-X table */
+ if (table_offset > 0) {
+ len = table_offset;
+ error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base);
+ if (error)
+ return (error);
+
+ base += len;
+ start += len;
+ remaining -= len;
+ }
+
+ /* Skip the MSI-X table */
+ /*
+ * NOTE(review): 'remaining' is unsigned; this assumes the BAR is at
+ * least table_offset + table_size bytes long -- confirm, otherwise the
+ * subtraction wraps.
+ */
+ base += table_size;
+ start += table_size;
+ remaining -= table_size;
+
+ /* Map everything beyond the end of the MSI-X table */
+ if (remaining > 0) {
+ len = remaining;
+ error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Discover the physical device's BARs and create matching emulated BARs.
+ *
+ * For every BAR the ppt driver reports, the real type/base/size are cached
+ * in sc->psc_bar[], an emulated BAR is allocated with pci_emul_alloc_pbar(),
+ * and memory BARs are mapped into the guest -- via init_msix_table() for the
+ * BAR holding the MSI-X table, directly otherwise.  I/O BARs are not mapped
+ * here.
+ *
+ * Returns 0 on success, -1 on a misaligned memory BAR or on any allocation
+ * or mapping failure.
+ */
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+ struct pci_devinst *pi = sc->psc_pi;
+ uint_t i;
+
+ /*
+ * Initialize BAR registers
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ enum pcibar_type bartype;
+ uint64_t base, size;
+ int error;
+
+ /* A failed query is treated as "this BAR is not present". */
+ if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
+ continue;
+ }
+
+ /* Memory BARs must have a page-aligned base and size. */
+ if (bartype != PCIBAR_IO) {
+ if (((base | size) & PAGE_MASK) != 0) {
+ warnx("passthru device %d BAR %d: "
+ "base %#lx or size %#lx not page aligned\n",
+ sc->pptfd, i, base, size);
+ return (-1);
+ }
+ }
+
+ /* Cache information about the "real" BAR */
+ sc->psc_bar[i].type = bartype;
+ sc->psc_bar[i].size = size;
+ sc->psc_bar[i].addr = base;
+
+ /* Allocate the BAR in the guest I/O or MMIO space */
+ error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
+ if (error)
+ return (-1);
+
+ /* The MSI-X table needs special handling */
+ if (i == pci_msix_table_bar(pi)) {
+ error = init_msix_table(ctx, sc, base);
+ if (error)
+ return (-1);
+ } else if (bartype != PCIBAR_IO) {
+ /* Map the physical BAR in the guest MMIO space */
+ error = vm_map_pptdev_mmio(ctx, sc->pptfd,
+ pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+ if (error)
+ return (-1);
+ }
+
+ /*
+ * 64-bit BAR takes up two slots so skip the next one.
+ */
+ if (bartype == PCIBAR_MEM64) {
+ /* Advances the loop index itself; the for-increment then moves past the high half. */
+ i++;
+ assert(i <= PCI_BARMAX);
+ sc->psc_bar[i].type = PCIBAR_MEMHI64;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Initialize the emulated config space of a passthru device: first the
+ * MSI/MSI-X capabilities, then the BARs.  The BAR step is not attempted if
+ * the MSI step fails.
+ *
+ * Returns 0 on success, -1 (after a warning) if either step fails.
+ */
+static int
+cfginit(struct vmctx *ctx, struct passthru_softc *sc)
+{
+	int rv = -1;
+
+	if (cfginitmsi(sc) != 0)
+		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
+	else if (cfginitbar(ctx, sc) != 0)
+		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
+	else
+		rv = 0;
+
+	return (rv);
+}
+
+/*
+ * pe_init callback for the "passthru" device model.
+ *
+ * 'opts' is the path of the ppt device node to pass through.  Guest memory
+ * must be wired.  The device is opened, assigned to the VM, and its
+ * interrupt limits and config space are initialized.
+ *
+ * Returns 0 on success and non-zero on failure.  On failure every resource
+ * acquired so far is released: the softc is freed (and pi->pi_arg cleared),
+ * the device is unassigned only if it had been assigned, and the ppt
+ * descriptor is closed only if it had been opened.
+ */
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int error, memflags, pptfd;
+	int assigned;
+	struct passthru_softc *sc;
+
+	sc = NULL;
+	pptfd = -1;		/* nothing opened yet */
+	assigned = 0;		/* nothing assigned to the VM yet */
+	error = 1;
+
+	memflags = vm_get_memflags(ctx);
+	if (!(memflags & VM_MEM_F_WIRED)) {
+		warnx("passthru requires guest memory to be wired");
+		goto done;
+	}
+
+	/* passthru_dev_open() leaves pptfd untouched on failure. */
+	if (opts == NULL || passthru_dev_open(opts, &pptfd) != 0) {
+		warnx("invalid passthru options");
+		goto done;
+	}
+
+	if (vm_assign_pptdev(ctx, pptfd) != 0) {
+		warnx("PCI device at %d is not using the ppt driver", pptfd);
+		goto done;
+	}
+	assigned = 1;
+
+	sc = calloc(1, sizeof(struct passthru_softc));
+	if (sc == NULL) {
+		warn("failed to allocate passthru softc");
+		goto done;
+	}
+
+	pi->pi_arg = sc;
+	sc->psc_pi = pi;
+	sc->pptfd = pptfd;
+
+	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
+	    &sc->msix_limit)) != 0)
+		goto done;
+
+	/* initialize config space */
+	if ((error = cfginit(ctx, sc)) != 0)
+		goto done;
+
+	error = 0;				/* success */
+done:
+	if (error) {
+		if (sc != NULL) {
+			/* Do not leave a dangling pointer to the softc. */
+			pi->pi_arg = NULL;
+			free(sc);
+		}
+		if (assigned)
+			(void) vm_unassign_pptdev(ctx, pptfd);
+		if (pptfd >= 0)
+			(void) close(pptfd);
+	}
+	return (error);
+}
+
+/*
+ * Return 1 if config-space offset 'coff' lies within the BAR registers
+ * (PCIR_BAR(0) up to, but not including, PCIR_BAR(PCI_BARMAX + 1)),
+ * otherwise 0.
+ */
+static int
+bar_access(int coff)
+{
+	return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1));
+}
+
+/*
+ * Return 1 if config-space offset 'coff' lies within the emulated part of
+ * the MSI capability, otherwise 0 (including when the device has no MSI
+ * capability).
+ */
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+	const int start = sc->psc_msi.capoff;
+	int end;
+
+	if (start == 0)
+		return (0);
+
+	end = start + msi_caplen(sc->psc_msi.msgctrl);
+
+	return (coff >= start && coff < end);
+}
+
+/*
+ * Return 1 if config-space offset 'coff' lies within the MSIX_CAPLEN bytes
+ * of the MSI-X capability, otherwise 0 (including when the device has no
+ * MSI-X capability).
+ */
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+	const int start = sc->psc_msix.capoff;
+
+	if (start != 0 && coff >= start && coff < start + MSIX_CAPLEN)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * pe_cfgread callback: handle a guest config-space read of 'bytes' bytes at
+ * offset 'coff'.
+ *
+ * Returns -1 without touching *rv for registers that are emulated: the
+ * BARs, the MSI capability, the MSI-X capability (a limit on interrupts
+ * imposed by the OS may alter the perceived register state) and, when the
+ * MSI capability was crafted by this file, the dword at PCIR_CAP_PTR.
+ * Everything else is read from the physical device into *rv and 0 is
+ * returned.
+ */
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int coff, int bytes, uint32_t *rv)
+{
+	struct passthru_softc *sc = pi->pi_arg;
+	int emulated;
+
+	emulated = bar_access(coff) || msicap_access(sc, coff) ||
+	    msixcap_access(sc, coff);
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * PCIR_CAP_PTR is emulated as well when the device does not support
+	 * the MSI capability natively.
+	 */
+	if (!emulated && sc->psc_msi.emulated &&
+	    coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+		emulated = 1;
+#endif
+
+	if (emulated)
+		return (-1);
+
+	/* Everything else just read from the device's config space */
+	*rv = read_config(sc, coff, bytes);
+
+	return (0);
+}
+
+/*
+ * pe_cfgwrite callback: handle a guest config-space write of 'bytes' bytes
+ * of 'val' at offset 'coff'.
+ *
+ * Returns -1 for BAR registers, which are emulated.  Writes to the MSI or
+ * MSI-X capability update the emulated capability and reprogram the device
+ * through vm_setup_pptdev_msi()/vm_setup_pptdev_msix(); a failure there
+ * terminates the process via err().  All other writes are forwarded to the
+ * physical device.  Returns 0 in every handled case.
+ */
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t val)
+{
+ int error, msix_table_entries, i;
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs are emulated
+ */
+ if (bar_access(coff))
+ return (-1);
+
+ /*
+ * MSI capability is emulated
+ */
+ if (msicap_access(sc, coff)) {
+ msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+ /* Push the updated MSI address/data/message count to the device. */
+ error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd,
+ pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
+ if (error != 0)
+ err(1, "vm_setup_pptdev_msi");
+ return (0);
+ }
+
+ if (msixcap_access(sc, coff)) {
+ msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+ /* While MSI-X is enabled, (re)program every table entry. */
+ if (pi->pi_msix.enabled) {
+ msix_table_entries = pi->pi_msix.table_count;
+ for (i = 0; i < msix_table_entries; i++) {
+ error = vm_setup_pptdev_msix(ctx, vcpu,
+ sc->pptfd, i,
+ pi->pi_msix.table[i].addr,
+ pi->pi_msix.table[i].msg_data,
+ pi->pi_msix.table[i].vector_control);
+
+ if (error)
+ err(1, "vm_setup_pptdev_msix");
+ }
+ }
+ return (0);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If this device does not support MSI natively then we cannot let
+ * the guest disable legacy interrupts from the device. It is the
+ * legacy interrupt that is triggering the virtual MSI to the guest.
+ */
+ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+ /* Only a 2-byte write exactly at PCIR_COMMAND is filtered. */
+ if (coff == PCIR_COMMAND && bytes == 2)
+ val &= ~PCIM_CMD_INTxDIS;
+ }
+#endif
+
+ write_config(sc, coff, bytes, val);
+
+ return (0);
+}
+
+/*
+ * pe_barwrite callback: handle a trapped guest write to BAR 'baridx'.
+ *
+ * Writes to the BAR holding the MSI-X table go to the MSI-X table
+ * emulation.  Any other BAR reaching this handler must be an I/O BAR; the
+ * write is forwarded to the device with the PPT_BAR_WRITE ioctl, whose
+ * result is ignored.
+ */
+static void
+passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size, uint64_t value)
+{
+	struct passthru_softc *sc = pi->pi_arg;
+	struct ppt_bar_io pbi;
+
+	if (baridx == pci_msix_table_bar(pi)) {
+		passthru_msix_table_write(ctx, vcpu, sc, offset, size, value);
+		return;
+	}
+
+	assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+
+	pbi.pbi_bar = baridx;
+	pbi.pbi_width = size;
+	pbi.pbi_off = offset;
+	pbi.pbi_data = value;
+	(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
+}
+
+/*
+ * pe_barread callback: handle a trapped guest read from BAR 'baridx'.
+ *
+ * Reads from the BAR holding the MSI-X table are served by the MSI-X table
+ * emulation.  Any other BAR reaching this handler must be an I/O BAR; the
+ * read is forwarded to the device with the PPT_BAR_READ ioctl.  Returns the
+ * value read, or 0 if that ioctl fails.
+ */
+static uint64_t
+passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct passthru_softc *sc = pi->pi_arg;
+	struct ppt_bar_io pbi;
+
+	if (baridx == pci_msix_table_bar(pi))
+		return (passthru_msix_table_read(sc, offset, size));
+
+	assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+
+	pbi.pbi_bar = baridx;
+	pbi.pbi_width = size;
+	pbi.pbi_off = offset;
+	if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) != 0)
+		return (0);
+
+	return (pbi.pbi_data);
+}
+
+/*
+ * Device-emulation descriptor for the "passthru" device model: it supplies
+ * the init, config-space read/write and BAR read/write callbacks defined
+ * above.  PCI_EMUL_SET() is defined outside this file; presumably it
+ * registers the descriptor with the PCI emulation core -- TODO confirm.
+ */
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_barwrite = passthru_write,
+ .pe_barread = passthru_read,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr/src/cmd/bhyve/pci_uart.c b/usr/src/cmd/bhyve/pci_uart.c
new file mode 100644
index 0000000000..093d0cb361
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_uart.c
@@ -0,0 +1,121 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "uart_emul.h"
+
+/*
+ * Pick a PCI vid/did of a chip with a single uart at
+ * BAR0, that most versions of FreeBSD can understand:
+ * Siig CyberSerial 1-port.
+ */
+#define COM_VENDOR 0x131f
+#define COM_DEV 0x2000
+
+/*
+ * Interrupt-assert callback handed to uart_init(); 'arg' is the
+ * struct pci_devinst whose legacy interrupt line is to be asserted.
+ */
+static void
+pci_uart_intr_assert(void *arg)
+{
+	pci_lintr_assert((struct pci_devinst *)arg);
+}
+
+/*
+ * Interrupt-deassert callback handed to uart_init(); 'arg' is the
+ * struct pci_devinst whose legacy interrupt line is to be deasserted.
+ */
+static void
+pci_uart_intr_deassert(void *arg)
+{
+	pci_lintr_deassert((struct pci_devinst *)arg);
+}
+
+/*
+ * pe_barwrite callback: forward a guest write to the UART register at
+ * 'offset'.  Only single-byte accesses to BAR 0 are expected.
+ */
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct uart_softc *uart = pi->pi_arg;
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	uart_write(uart, offset, value);
+}
+
+/*
+ * pe_barread callback: return the UART register at 'offset', truncated to
+ * eight bits.  Only single-byte accesses to BAR 0 are expected.
+ */
+uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size)
+{
+	struct uart_softc *uart = pi->pi_arg;
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	return ((uint8_t)uart_read(uart, offset));
+}
+
+/*
+ * pe_init callback for the "uart" device model.
+ *
+ * Allocates an I/O BAR 0 of UART_IO_BAR_SIZE bytes, requests a legacy
+ * interrupt, fills in the vendor/device/class config registers, creates the
+ * UART instance and attaches the backend named by 'opts'.
+ *
+ * Returns 0 on success, -1 if the backend cannot be initialized.
+ */
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct uart_softc *sc;
+
+ /* NOTE(review): the result of pci_emul_alloc_bar() is not checked. */
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE);
+ pci_lintr_request(pi);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+
+ /* The UART raises and clears its interrupt through the callbacks above. */
+ sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi);
+ pi->pi_arg = sc;
+
+ if (uart_set_backend(sc, opts) != 0) {
+ fprintf(stderr, "Unable to initialize backend '%s' for "
+ "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Device-emulation descriptor for the "uart" device model: it supplies the
+ * init and BAR read/write callbacks defined above.  PCI_EMUL_SET() is
+ * defined outside this file; presumably it registers the descriptor with
+ * the PCI emulation core -- TODO confirm.
+ */
+struct pci_devemu pci_de_com = {
+ .pe_emu = "uart",
+ .pe_init = pci_uart_init,
+ .pe_barwrite = pci_uart_write,
+ .pe_barread = pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000000..b0c3b06187
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_block.c
@@ -0,0 +1,485 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <md5.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+#include "block_if.h"
+
+/* Number of entries in the single virtqueue (and in vbsc_ios[]) */
+#ifdef __FreeBSD__
+#define VTBLK_RINGSZ 64
+#else
+/* Enlarge to match bigger BLOCKIF_IOV_MAX */
+#define VTBLK_RINGSZ 128
+#endif
+
+/* Status byte values stored into the request's status descriptor */
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+#define VTBLK_S_UNSUPP 2
+
+/*
+ * Size of the identifier buffer (presumably 20 id bytes plus a NUL).
+ * Parenthesized so the macro expands safely inside larger expressions.
+ */
+#define VTBLK_BLK_ID_BYTES (20 + 1)
+
+/*
+ * Capability bits
+ *
+ * Each value is a single-bit mask; they are OR'ed together into
+ * VTBLK_S_HOSTCAPS below and tested against the negotiated feature
+ * word in pci_vtblk_apply_feats().
+ */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
+#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
+#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */
+
+/*
+ * Host capabilities
+ *
+ * The feature mask this device offers; it is the last member of
+ * vtblk_vi_consts.
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( VTBLK_F_SEG_MAX | \
+ VTBLK_F_BLK_SIZE | \
+ VTBLK_F_FLUSH | \
+ VTBLK_F_TOPOLOGY | \
+ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
+
+/*
+ * Config space "registers"
+ *
+ * Filled in once by pci_vtblk_init() and copied out to the guest by
+ * pci_vtblk_cfgread(); pci_vtblk_cfgwrite() never modifies it.
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity; /* device size in DEV_BSIZE units */
+ uint32_t vbc_size_max; /* set to 0 ("not negotiated") */
+ uint32_t vbc_seg_max; /* max data segments per request */
+ struct {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } vbc_geometry; /* left all-zero: no geometry */
+ uint32_t vbc_blk_size; /* sector size from blockif_sectsz() */
+ struct {
+ uint8_t physical_block_exp; /* log2 of physical/logical sector size ratio, 0 if not larger */
+ uint8_t alignment_offset; /* derived from the offset reported by blockif_psectsz() */
+ uint16_t min_io_size; /* set to 0 */
+ uint32_t opt_io_size; /* set to 0 */
+ } vbc_topology;
+ uint8_t vbc_writeback; /* set to 0 */
+} __packed;
+
+/*
+ * Fixed-size block header
+ *
+ * pci_vtblk_proc() expects this as the first, read-only descriptor of
+ * every request chain.
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+#define VBH_OP_FLUSH 4
+#define VBH_OP_FLUSH_OUT 5
+#define VBH_OP_IDENT 8
+#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
+ uint32_t vbh_type; /* VBH_OP_* code; the barrier flag is masked off before dispatch */
+ uint32_t vbh_ioprio; /* not examined by this file */
+ uint64_t vbh_sector; /* start sector; multiplied by DEV_BSIZE to get the byte offset */
+} __packed;
+
+/*
+ * Debug printf
+ *
+ * Both macros take a parenthesized printf argument list, e.g.
+ * DPRINTF(("fmt %d\n", x)). DPRINTF is wrapped in do { } while (0) so
+ * that it is a single statement and cannot capture a following "else".
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) do { if (pci_vtblk_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+struct pci_vtblk_ioreq { /* per-request state; one element of vbsc_ios[] per ring slot */
+ struct blockif_req io_req; /* request passed to blockif_read/write/flush */
+ struct pci_vtblk_softc *io_sc; /* owning device */
+ uint8_t *io_status; /* where the VTBLK_S_* status byte is stored on completion */
+ uint16_t io_idx; /* slot number; passed to vq_relchain() as the chain index */
+};
+
+/*
+ * Per-device softc
+ *
+ * Allocated zeroed in pci_vtblk_init(), one per virtio-blk instance.
+ */
+struct pci_vtblk_softc {
+ struct virtio_softc vbsc_vs; /* common virtio state, set up by vi_softc_linkup() */
+ pthread_mutex_t vsc_mtx; /* installed as vbsc_vs.vs_mtx; taken by pci_vtblk_done() */
+ struct vqueue_info vbsc_vq; /* the single request queue */
+ struct vtblk_config vbsc_cfg; /* device config served by pci_vtblk_cfgread() */
+ struct blockif_ctxt *bc; /* backing store handle from blockif_open() */
+#ifndef __FreeBSD__
+ int vbsc_wce; /* last value passed to blockif_set_wce() */
+#endif
+ char vbsc_ident[VTBLK_BLK_ID_BYTES]; /* string returned for VBH_OP_IDENT */
+ struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; /* indexed by descriptor chain index */
+};
+
+static void pci_vtblk_reset(void *); /* forward declarations for the table below */
+static void pci_vtblk_notify(void *, struct vqueue_info *);
+static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
+static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+#ifndef __FreeBSD__
+static void pci_vtblk_apply_feats(void *, uint64_t);
+#endif
+
+static struct virtio_consts vtblk_vi_consts = { /* handed to vi_softc_linkup() in pci_vtblk_init() */
+ "vtblk", /* our name */
+ 1, /* we support 1 virtqueue */
+ sizeof(struct vtblk_config), /* config reg size */
+ pci_vtblk_reset, /* reset */
+ pci_vtblk_notify, /* device-wide qnotify */
+ pci_vtblk_cfgread, /* read device config (vbsc_cfg) */
+ pci_vtblk_cfgwrite, /* write device config (always refused) */
+#ifndef __FreeBSD__
+ pci_vtblk_apply_feats, /* apply negotiated features */
+#else
+ NULL, /* apply negotiated features */
+#endif
+ VTBLK_S_HOSTCAPS, /* our capabilities */
+};
+
+/*
+ * Reset callback from vtblk_vi_consts: reset the common virtio state
+ * and, on non-FreeBSD builds, switch the write cache back off.
+ */
+static void
+pci_vtblk_reset(void *vsc)
+{
+	struct pci_vtblk_softc *blk;
+
+	blk = vsc;
+
+	DPRINTF(("vtblk: device reset requested !\n"));
+	vi_reset_dev(&blk->vbsc_vs);
+
+#ifndef __FreeBSD__
+	/* Disable write cache until FLUSH feature is negotiated */
+	(void) blockif_set_wce(blk->bc, 0);
+	blk->vbsc_wce = 0;
+#endif
+}
+
+/*
+ * Complete one request: translate the errno-style result into a virtio
+ * block status byte, store it through io_status and release the
+ * descriptor chain. pci_vtblk_done() calls this with vsc_mtx held.
+ */
+static void
+pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
+{
+	struct vqueue_info *vq = &io->io_sc->vbsc_vq;
+	uint8_t status;
+
+	/* convert errno into a virtio block error return */
+	if (err == 0)
+		status = VTBLK_S_OK;
+	else if (err == EOPNOTSUPP || err == ENOSYS)
+		status = VTBLK_S_UNSUPP;
+	else
+		status = VTBLK_S_IOERR;
+	*io->io_status = status;
+
+	/*
+	 * Return the descriptor chain to the guest. Exactly one byte
+	 * (the status) is reported as written.
+	 */
+	vq_relchain(vq, io->io_idx, 1);
+	vq_endchains(vq, 0);
+}
+
+/*
+ * blockif completion callback (installed as br_callback by
+ * pci_vtblk_init()): finish the request under the device mutex.
+ */
+static void
+pci_vtblk_done(struct blockif_req *br, int err)
+{
+	struct pci_vtblk_ioreq *req = br->br_param;
+	pthread_mutex_t *lock = &req->io_sc->vsc_mtx;
+
+	pthread_mutex_lock(lock);
+	pci_vtblk_done_locked(req, err);
+	pthread_mutex_unlock(lock);
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
+{
+ struct virtio_blk_hdr *vbh;
+ struct pci_vtblk_ioreq *io;
+ int i, n;
+ int err;
+ ssize_t iolen;
+ int writeop, type;
+ struct iovec iov[BLOCKIF_IOV_MAX + 2];
+ uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
+
+ n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* take one request chain off the ring */
+
+ /*
+ * Process exactly one request chain per call.
+ *
+ * The first descriptor will be the read-only fixed header,
+ * and the last is for status (hence +2 above and below).
+ * The remaining iov's are the actual data I/O vectors.
+ *
+ * XXX - note - this fails on crash dump, which does a
+ * VIRTIO_BLK_T_FLUSH with a zero transfer length
+ *
+ * NOTE(review): every check below is an assert(), so a chain that
+ * does not have this shape aborts the process.
+ */
+ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
+
+ io = &sc->vbsc_ios[idx]; /* request slot is selected by the chain index */
+ assert((flags[0] & VRING_DESC_F_WRITE) == 0);
+ assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+ vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
+ memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
+ io->io_req.br_iovcnt = n - 2;
+ io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
+ io->io_status = (uint8_t *)iov[--n].iov_base; /* n now indexes the status descriptor */
+ assert(iov[n].iov_len == 1);
+ assert(flags[n] & VRING_DESC_F_WRITE);
+
+ /*
+ * XXX
+ * The guest should not be setting the BARRIER flag because
+ * we don't advertise the capability.
+ */
+ type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+ writeop = (type == VBH_OP_WRITE);
+
+ iolen = 0;
+ for (i = 1; i < n; i++) { /* data descriptors only: header (0) and status (n) are skipped */
+ /*
+ * - write op implies read-only descriptor,
+ * - read/ident op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
+ iolen += iov[i].iov_len;
+ }
+ io->io_req.br_resid = iolen; /* total data bytes in the request */
+
+ /* NOTE(review): br_offset is printed with %ld; confirm its type matches. */
+ DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read/ident", iolen, i - 1,
+ io->io_req.br_offset));
+
+ switch (type) {
+ case VBH_OP_READ:
+ err = blockif_read(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_WRITE:
+ err = blockif_write(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_FLUSH:
+ case VBH_OP_FLUSH_OUT:
+ err = blockif_flush(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_IDENT:
+ /* Assume a single buffer */
+ /* S/n equal to buffer is not zero-terminated. */
+ memset(iov[1].iov_base, 0, iov[1].iov_len);
+ strncpy(iov[1].iov_base, sc->vbsc_ident,
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ pci_vtblk_done_locked(io, 0); /* answered here; no blockif request is issued */
+ return;
+ default:
+ pci_vtblk_done_locked(io, EOPNOTSUPP); /* unknown op: reported as VTBLK_S_UNSUPP */
+ return;
+ }
+ assert(err == 0); /* only submission is checked here; the result arrives via pci_vtblk_done() */
+}
+
+/*
+ * Queue-notify callback from vtblk_vi_consts: process request chains
+ * until the queue has no more available descriptors.
+ */
+static void
+pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtblk_softc *blk = vsc;
+
+	for (;;) {
+		if (!vq_has_descs(vq))
+			break;
+		pci_vtblk_proc(blk, vq);
+	}
+}
+
+/*
+ * Device init for "virtio-blk".
+ *
+ * 'opts' names the backing device and is passed to blockif_open().
+ * Allocates the softc, links it to the virtio layer, derives the
+ * identifier string from the MD5 of 'opts', fills in the device config
+ * and programs the PCI identity.
+ *
+ * Returns 0 on success and 1 on any failure. The blockif context is
+ * closed again when the softc cannot be allocated or interrupt setup
+ * fails.
+ */
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	char bident[sizeof("XX:X:X")];
+	struct blockif_ctxt *bctxt;
+	MD5_CTX mdctx;
+	u_char digest[16];
+	struct pci_vtblk_softc *sc;
+	off_t size;
+	int i, sectsz, sts, sto;
+
+	if (opts == NULL) {
+		printf("virtio-block: backing device required\n");
+		return (1);
+	}
+
+	/*
+	 * The supplied backing file has to exist
+	 */
+	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
+	bctxt = blockif_open(opts, bident);
+	if (bctxt == NULL) {
+		perror("Could not open backing file");
+		return (1);
+	}
+
+	size = blockif_size(bctxt);
+	sectsz = blockif_sectsz(bctxt);
+	blockif_psectsz(bctxt, &sts, &sto);
+
+	sc = calloc(1, sizeof(struct pci_vtblk_softc));
+	if (sc == NULL) {
+		/* Do not dereference NULL or leak the open backing store. */
+		perror("virtio-block: could not allocate device state");
+		blockif_close(bctxt);
+		return (1);
+	}
+	sc->bc = bctxt;
+
+	/* Pre-wire one request slot per ring entry. */
+	for (i = 0; i < VTBLK_RINGSZ; i++) {
+		struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
+		io->io_req.br_callback = pci_vtblk_done;
+		io->io_req.br_param = io;
+		io->io_sc = sc;
+		io->io_idx = i;
+	}
+
+#ifndef __FreeBSD__
+	/* Disable write cache until FLUSH feature is negotiated */
+	(void) blockif_set_wce(sc->bc, 0);
+	sc->vbsc_wce = 0;
+#endif
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	/* init virtio softc and virtqueues */
+	vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+	/*
+	 * Create an identifier for the backing file. Use parts of the
+	 * md5 sum of the filename
+	 */
+	MD5Init(&mdctx);
+	MD5Update(&mdctx, opts, strlen(opts));
+	MD5Final(digest, &mdctx);
+	snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
+	    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
+
+	/* setup virtio block config space */
+	sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
+	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
+#ifdef __FreeBSD__
+	sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX;
+#else
+	/*
+	 * If Linux is presented with a seg_max greater than the virtio queue
+	 * size, it can stumble into situations where it violates its own
+	 * invariants and panics. For safety, we keep seg_max clamped, paying
+	 * heed to the two extra descriptors needed for the header and status
+	 * of a request.
+	 */
+	sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
+#endif
+	sc->vbsc_cfg.vbc_geometry.cylinders = 0;	/* no geometry */
+	sc->vbsc_cfg.vbc_geometry.heads = 0;
+	sc->vbsc_cfg.vbc_geometry.sectors = 0;
+	sc->vbsc_cfg.vbc_blk_size = sectsz;
+	sc->vbsc_cfg.vbc_topology.physical_block_exp =
+	    (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
+	sc->vbsc_cfg.vbc_topology.alignment_offset =
+	    (sto != 0) ? ((sts - sto) / sectsz) : 0;
+	sc->vbsc_cfg.vbc_topology.min_io_size = 0;
+	sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
+	sc->vbsc_cfg.vbc_writeback = 0;
+
+	/*
+	 * Should we move some of this into virtio.c? Could
+	 * have the device, class, and subdev_0 as fields in
+	 * the virtio constants structure.
+	 */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
+		blockif_close(sc->bc);
+		free(sc);
+		return (1);
+	}
+	vi_set_io_bar(&sc->vbsc_vs, 0);
+	return (0);
+}
+
+/*
+ * Config write callback. No config field is writable: the attempt is
+ * logged when debugging is enabled and 1 is returned.
+ */
+static int
+pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+	DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+
+	return (1);
+}
+
+/*
+ * Config read callback: copy 'size' bytes starting at 'offset' of the
+ * device config into *retval. Always returns 0.
+ */
+static int
+pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtblk_softc *blk = vsc;
+	const uint8_t *cfg = (const uint8_t *)&blk->vbsc_cfg;
+
+	/* our caller has already verified offset and size */
+	memcpy(retval, cfg + offset, size);
+	return (0);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Apply negotiated features: the write cache is enabled exactly when
+ * the guest accepted VTBLK_F_FLUSH. blockif_set_wce() is only called
+ * when the desired state differs from the cached one.
+ */
+void
+pci_vtblk_apply_feats(void *vsc, uint64_t caps)
+{
+	struct pci_vtblk_softc *blk = vsc;
+	int want_wce;
+
+	want_wce = (caps & VTBLK_F_FLUSH) ? 1 : 0;
+	if (blk->vbsc_wce == want_wce)
+		return;
+
+	(void) blockif_set_wce(blk->bc, want_wce);
+	blk->vbsc_wce = want_wce;
+}
+#endif /* __FreeBSD__ */
+
+struct pci_devemu pci_de_vblk = { /* emulation descriptor for the "virtio-blk" PCI device */
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init, /* device setup */
+ .pe_barwrite = vi_pci_write, /* BAR accesses are handled by the virtio layer */
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c
new file mode 100644
index 0000000000..90437662df
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_console.c
@@ -0,0 +1,701 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 iXsystems Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Jakub Klama <jceel@FreeBSD.org>
+ * under sponsorship from iXsystems Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/linker_set.h>
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <libgen.h>
+#include <sysexits.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+#include "mevent.h"
+#include "sockstream.h"
+
+#define VTCON_RINGSZ 64 /* vq_qsize of every queue */
+#define VTCON_MAXPORTS 16 /* size of vsc_ports[]; reported as max_nr_ports */
+#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) /* two queues per port plus two for the control port */
+
+#define VTCON_DEVICE_READY 0 /* values of pci_vtcon_control.event */
+#define VTCON_DEVICE_ADD 1
+#define VTCON_DEVICE_REMOVE 2
+#define VTCON_PORT_READY 3
+#define VTCON_CONSOLE_PORT 4
+#define VTCON_CONSOLE_RESIZE 5
+#define VTCON_PORT_OPEN 6
+#define VTCON_PORT_NAME 7
+
+#define VTCON_F_SIZE 0 /* feature identifiers, combined into VTCON_S_HOSTCAPS */
+#define VTCON_F_MULTIPORT 1
+#define VTCON_F_EMERG_WRITE 2
+#define VTCON_S_HOSTCAPS \
+ (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) /* NOTE(review): evaluates to 0|1|2 == 3; confirm these were not meant to be (1 << n) masks */
+
+/*
+ * Debug/warning printf. Both macros take a parenthesized printf
+ * argument list, e.g. DPRINTF(("fmt %d\n", x)). DPRINTF is wrapped in
+ * do { } while (0) so that it is a single statement and cannot capture
+ * a following "else".
+ */
+static int pci_vtcon_debug;
+#define DPRINTF(params) do { if (pci_vtcon_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+struct pci_vtcon_softc;
+struct pci_vtcon_port;
+struct pci_vtcon_config;
+typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *,
+ int); /* per-port handler: (port, vsp_arg, iov, niov) */
+
+struct pci_vtcon_port {
+ struct pci_vtcon_softc * vsp_sc; /* owning device */
+ int vsp_id; /* index in vsc_ports[]; used as id in control messages */
+ const char * vsp_name; /* sent to the guest with VTCON_PORT_NAME */
+ bool vsp_enabled; /* set by pci_vtcon_port_add(); such ports are announced on DEVICE_READY */
+ bool vsp_console; /* if set, PORT_READY is answered with VTCON_CONSOLE_PORT */
+ bool vsp_rx_ready; /* set on the first pci_vtcon_notify_rx() */
+ bool vsp_open; /* open requested before the device was ready; replayed on DEVICE_READY */
+ int vsp_rxq; /* index into vsc_queues[] */
+ int vsp_txq; /* index into vsc_queues[] */
+ void * vsp_arg; /* opaque argument passed to vsp_cb */
+ pci_vtcon_cb_t * vsp_cb; /* called by pci_vtcon_notify_tx() for each chain */
+};
+
+struct pci_vtcon_sock /* state of one UNIX-socket backed port */
+{
+ struct pci_vtcon_port * vss_port; /* port created by pci_vtcon_sock_add() */
+ const char * vss_path; /* not assigned anywhere in this file */
+ struct mevent * vss_server_evp; /* accept event on the listening socket */
+ struct mevent * vss_conn_evp; /* read event on the connected socket */
+ int vss_server_fd; /* listening socket */
+ int vss_conn_fd; /* connected client, or -1 */
+ bool vss_open; /* true while a client is connected */
+};
+
+struct pci_vtcon_softc { /* per-device state, allocated zeroed in pci_vtcon_init() */
+ struct virtio_softc vsc_vs; /* common virtio state */
+ struct vqueue_info vsc_queues[VTCON_MAXQ]; /* all port and control queues */
+ pthread_mutex_t vsc_mtx; /* installed as vsc_vs.vs_mtx */
+ uint64_t vsc_cfg; /* not referenced in this file */
+ uint64_t vsc_features; /* stored by pci_vtcon_neg_features() */
+ char * vsc_rootdir; /* not referenced in this file */
+ int vsc_kq; /* not referenced in this file */
+ int vsc_nports; /* number of entries of vsc_ports[] in use */
+ bool vsc_ready; /* set once the guest sent VTCON_DEVICE_READY */
+ struct pci_vtcon_port vsc_control_port; /* uses queues 2 and 3 */
+ struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS];
+ struct pci_vtcon_config *vsc_config; /* device config served by pci_vtcon_cfgread() */
+};
+
+struct pci_vtcon_config { /* device config; copied out by pci_vtcon_cfgread() */
+ uint16_t cols; /* set to 80 in pci_vtcon_init() */
+ uint16_t rows; /* set to 25 in pci_vtcon_init() */
+ uint32_t max_nr_ports; /* set to VTCON_MAXPORTS */
+ uint32_t emerg_wr; /* left zero; writes are ignored by pci_vtcon_cfgwrite() */
+} __attribute__((packed));
+
+struct pci_vtcon_control { /* header of every control-queue message */
+ uint32_t id; /* port number */
+ uint16_t event; /* one of the VTCON_* event codes */
+ uint16_t value;
+} __attribute__((packed));
+
+struct pci_vtcon_console_resize { /* not referenced elsewhere in this file */
+ uint16_t cols;
+ uint16_t rows;
+} __attribute__((packed));
+
+static void pci_vtcon_reset(void *); /* forward declarations */
+static void pci_vtcon_notify_rx(void *, struct vqueue_info *);
+static void pci_vtcon_notify_tx(void *, struct vqueue_info *);
+static int pci_vtcon_cfgread(void *, int, int, uint32_t *);
+static int pci_vtcon_cfgwrite(void *, int, int, uint32_t);
+static void pci_vtcon_neg_features(void *, uint64_t);
+static void pci_vtcon_sock_accept(int, enum ev_type, void *);
+static void pci_vtcon_sock_rx(int, enum ev_type, void *);
+static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *,
+ int);
+static void pci_vtcon_control_send(struct pci_vtcon_softc *,
+ struct pci_vtcon_control *, const void *, size_t);
+static void pci_vtcon_announce_port(struct pci_vtcon_port *);
+static void pci_vtcon_open_port(struct pci_vtcon_port *, bool);
+
+static struct virtio_consts vtcon_vi_consts = { /* handed to vi_softc_linkup() in pci_vtcon_init() */
+ "vtcon", /* our name */
+ VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */
+ sizeof(struct pci_vtcon_config), /* config reg size */
+ pci_vtcon_reset, /* reset */
+ NULL, /* device-wide qnotify; per-queue vq_notify is used instead */
+ pci_vtcon_cfgread, /* read virtio config */
+ pci_vtcon_cfgwrite, /* write virtio config */
+ pci_vtcon_neg_features, /* apply negotiated features */
+ VTCON_S_HOSTCAPS, /* our capabilities */
+};
+
+
+/*
+ * Reset callback from vtcon_vi_consts: reset the common virtio state.
+ */
+static void
+pci_vtcon_reset(void *vsc)
+{
+	struct pci_vtcon_softc *con = vsc;
+
+	DPRINTF(("vtcon: device reset requested!\n"));
+	vi_reset_dev(&con->vsc_vs);
+}
+
+/*
+ * Feature negotiation callback: remember what the guest accepted.
+ */
+static void
+pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features)
+{
+	struct pci_vtcon_softc *con;
+
+	con = vsc;
+	con->vsc_features = negotiated_features;
+}
+
+/*
+ * Config read callback: copy 'size' bytes starting at 'offset' of the
+ * device config into *retval. Always returns 0.
+ */
+static int
+pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtcon_softc *con = vsc;
+	const uint8_t *cfg = (const uint8_t *)con->vsc_config;
+
+	memcpy(retval, cfg + offset, size);
+	return (0);
+}
+
+/*
+ * Config write callback: the written value is discarded and 0 is
+ * returned.
+ */
+static int
+pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val)
+{
+	return (0);
+}
+
+/*
+ * Map a virtqueue to the port that owns it: queues 0/1 belong to port
+ * 0, queues 2/3 to the control port, and every later pair 2k/2k+1 to
+ * vsc_ports[k - 1].
+ */
+static inline struct pci_vtcon_port *
+pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq)
+{
+	const uint16_t qn = vq->vq_num;
+
+	switch (qn) {
+	case 0:
+	case 1:
+		return (&sc->vsc_ports[0]);
+	case 2:
+	case 3:
+		return (&sc->vsc_control_port);
+	default:
+		return (&sc->vsc_ports[(qn / 2) - 1]);
+	}
+}
+
+/*
+ * Return the port's queue: the one recorded in vsp_txq when 'tx_queue'
+ * is true, otherwise the one recorded in vsp_rxq.
+ */
+static inline struct vqueue_info *
+pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue)
+{
+	struct pci_vtcon_softc *con = port->vsp_sc;
+
+	if (tx_queue)
+		return (&con->vsc_queues[port->vsp_txq]);
+
+	return (&con->vsc_queues[port->vsp_rxq]);
+}
+
+/*
+ * Register the next free port with the given name, callback and
+ * callback argument. Port 0 uses queues 0/1; port k > 0 uses queues
+ * 2(k+1) and 2(k+1)+1. Returns the port, or NULL with errno set to
+ * EBUSY when all VTCON_MAXPORTS slots are taken.
+ */
+static struct pci_vtcon_port *
+pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name,
+    pci_vtcon_cb_t *cb, void *arg)
+{
+	struct pci_vtcon_port *slot;
+	int id;
+
+	if (sc->vsc_nports == VTCON_MAXPORTS) {
+		errno = EBUSY;
+		return (NULL);
+	}
+
+	id = sc->vsc_nports++;
+	slot = &sc->vsc_ports[id];
+
+	slot->vsp_id = id;
+	slot->vsp_sc = sc;
+	slot->vsp_name = name;
+	slot->vsp_cb = cb;
+	slot->vsp_arg = arg;
+
+	/* port0 has fixed queues; later ports follow the control queues */
+	slot->vsp_txq = (id == 0) ? 0 : (id + 1) * 2;
+	slot->vsp_rxq = slot->vsp_txq + 1;
+
+	slot->vsp_enabled = true;
+	return (slot);
+}
+
+/*
+ * Create a listening UNIX-domain socket at 'path' and register it as a
+ * new console port called 'name'.
+ *
+ * Returns 0 on success and -1 on failure. On failure errno is the one
+ * set by the failing call, since the caller reports it with strerror().
+ * The listening socket is closed on failure, and the socket state is
+ * freed unless a port already refers to it.
+ */
+static int
+pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name,
+    const char *path)
+{
+	struct pci_vtcon_sock *sock;
+#ifdef __FreeBSD__
+	struct sockaddr_un sun;
+	char *pathcopy;
+#else
+	/* Our compiler #defines 'sun' as '1'. Awesome. */
+	struct sockaddr_un addr;
+#endif
+	int s = -1, fd = -1, error = 0;
+	int saved_errno;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	sock = calloc(1, sizeof(struct pci_vtcon_sock));
+	if (sock == NULL) {
+		error = -1;
+		goto out;
+	}
+
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (s < 0) {
+		error = -1;
+		goto out;
+	}
+
+#ifdef __FreeBSD__
+	pathcopy = strdup(path);
+	if (pathcopy == NULL) {
+		error = -1;
+		goto out;
+	}
+
+	fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY);
+	if (fd < 0) {
+		free(pathcopy);
+		error = -1;
+		goto out;
+	}
+
+	sun.sun_family = AF_UNIX;
+	sun.sun_len = sizeof(struct sockaddr_un);
+	strcpy(pathcopy, path);
+	strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path));
+	free(pathcopy);
+
+	if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) {
+		error = -1;
+		goto out;
+	}
+#else /* __FreeBSD__ */
+	/*
+	 * Do a simple bind rather than the FreeBSD bindat(). The socket
+	 * to bind is 's'; 'fd' is only used on the bindat() path and is
+	 * still -1 here.
+	 */
+	memset(&addr, 0, sizeof (addr));
+	addr.sun_family = AF_UNIX;
+	(void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path));
+	if (bind(s, (struct sockaddr *)&addr, sizeof (addr)) < 0) {
+		error = -1;
+		goto out;
+	}
+#endif /* __FreeBSD__ */
+
+	if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) {
+		error = -1;
+		goto out;
+	}
+
+	if (listen(s, 1) < 0) {
+		error = -1;
+		goto out;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(s, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock);
+	if (sock->vss_port == NULL) {
+		error = -1;
+		goto out;
+	}
+
+	sock->vss_open = false;
+	sock->vss_conn_fd = -1;
+	sock->vss_server_fd = s;
+	sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept,
+	    sock);
+
+	if (sock->vss_server_evp == NULL) {
+		error = -1;
+		goto out;
+	}
+
+out:
+	/* The cleanup below must not disturb the errno the caller reports. */
+	saved_errno = errno;
+
+	if (fd != -1)
+		close(fd);
+
+	if (error != 0) {
+		if (s != -1)
+			close(s);
+		/*
+		 * Once a port has been registered it keeps a pointer to
+		 * 'sock' in vsp_arg, so only free it before that point.
+		 */
+		if (sock != NULL && sock->vss_port == NULL)
+			free(sock);
+	}
+
+	errno = saved_errno;
+	return (error);
+}
+
+/*
+ * mevent callback on the listening socket: accept one client. If a
+ * client is already connected the new connection is closed at once;
+ * otherwise it becomes the port's connection and the port is opened.
+ */
+static void
+pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg)
+{
+	struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg;
+	int conn;
+
+	conn = accept(sock->vss_server_fd, NULL, NULL);
+	if (conn < 0)
+		return;
+
+	if (!sock->vss_open) {
+		sock->vss_open = true;
+		sock->vss_conn_fd = conn;
+		sock->vss_conn_evp = mevent_add(conn, EVF_READ,
+		    pci_vtcon_sock_rx, sock);
+
+		pci_vtcon_open_port(sock->vss_port, true);
+		return;
+	}
+
+	/* only one client at a time */
+	close(conn);
+}
+
+/*
+ * mevent callback on the connected socket: move data from the client
+ * into the guest's buffers.
+ *
+ * While the port is not open/ready, or the guest has posted no buffers,
+ * pending data is read into a scratch buffer and discarded. Otherwise
+ * one guest buffer is filled per readv() until the socket would block
+ * or the guest runs out of buffers.
+ *
+ * The connection is torn down only on end-of-file or a hard read
+ * error. Filling every available guest buffer is a normal outcome and
+ * keeps the connection open.
+ */
+static void
+pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg)
+{
+	struct pci_vtcon_port *port;
+	struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg;
+	struct vqueue_info *vq;
+	struct iovec iov;
+	static char dummybuf[2048];
+	int len, n, rderr;
+	uint16_t idx;
+
+	port = sock->vss_port;
+	vq = pci_vtcon_port_to_vq(port, true);
+
+	if (!sock->vss_open || !port->vsp_rx_ready) {
+		len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf));
+		if (len == 0)
+			goto close;
+
+		return;
+	}
+
+	if (!vq_has_descs(vq)) {
+		len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf));
+		vq_endchains(vq, 1);
+		if (len == 0)
+			goto close;
+
+		return;
+	}
+
+	do {
+		n = vq_getchain(vq, &idx, &iov, 1, NULL);
+		len = readv(sock->vss_conn_fd, &iov, n);
+		rderr = errno;	/* the vq calls below may change errno */
+
+		if (len <= 0) {
+			/* Nothing was stored: give the buffer back unused. */
+			vq_retchain(vq);
+			vq_endchains(vq, 0);
+
+			/* No data right now; wait for the next event. */
+			if (len < 0 &&
+			    (rderr == EWOULDBLOCK || rderr == EINTR))
+				return;
+
+			/* End-of-file or a hard error. */
+			goto close;
+		}
+
+		vq_relchain(vq, idx, len);
+	} while (vq_has_descs(vq));
+
+	vq_endchains(vq, 1);
+	return;
+
+close:
+	mevent_delete_close(sock->vss_conn_evp);
+	sock->vss_conn_fd = -1;
+	sock->vss_open = false;
+}
+
+/*
+ * Port callback for socket-backed ports: write the guest's buffers to
+ * the connected client. Nothing happens when no client is connected.
+ * If a write returns <= 0 the connection is closed.
+ */
+static void
+pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov,
+    int niov)
+{
+	struct pci_vtcon_sock *conn = (struct pci_vtcon_sock *)arg;
+#ifdef __FreeBSD__
+	int seg, rc;
+#else
+	int seg, rc = 0;
+#endif
+
+	if (conn->vss_conn_fd == -1)
+		return;
+
+	for (seg = 0; seg < niov; seg++) {
+		rc = stream_write(conn->vss_conn_fd, iov[seg].iov_base,
+		    iov[seg].iov_len);
+		if (rc <= 0)
+			break;
+	}
+
+	if (rc > 0)
+		return;
+
+	mevent_delete_close(conn->vss_conn_evp);
+	conn->vss_conn_fd = -1;
+	conn->vss_open = false;
+}
+
+/*
+ * Port callback of the control port: handle one control message
+ * written by the guest.
+ *
+ * VTCON_DEVICE_READY marks the device ready, announces every enabled
+ * port and replays pending opens. VTCON_PORT_READY answers with
+ * VTCON_CONSOLE_PORT for ports flagged as consoles. All other events
+ * are ignored. Messages shorter than the control header are dropped.
+ */
+static void
+pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov,
+    int niov)
+{
+	struct pci_vtcon_softc *sc;
+	struct pci_vtcon_port *tmp;
+	struct pci_vtcon_control resp, *ctrl;
+	int i;
+
+	assert(niov == 1);
+
+	sc = port->vsp_sc;
+
+	/* The buffer comes from the guest: never read past its end. */
+	if (iov->iov_len < sizeof(struct pci_vtcon_control)) {
+		WPRINTF(("vtcon: short control message (%zu bytes)\n",
+		    iov->iov_len));
+		return;
+	}
+	ctrl = (struct pci_vtcon_control *)iov->iov_base;
+
+	switch (ctrl->event) {
+	case VTCON_DEVICE_READY:
+		sc->vsc_ready = true;
+		/* set port ready events for registered ports */
+		for (i = 0; i < VTCON_MAXPORTS; i++) {
+			tmp = &sc->vsc_ports[i];
+			if (tmp->vsp_enabled)
+				pci_vtcon_announce_port(tmp);
+
+			if (tmp->vsp_open)
+				pci_vtcon_open_port(tmp, true);
+		}
+		break;
+
+	case VTCON_PORT_READY:
+		if (ctrl->id >= sc->vsc_nports) {
+			WPRINTF(("VTCON_PORT_READY event for unknown port %u\n",
+			    ctrl->id));
+			return;
+		}
+
+		tmp = &sc->vsc_ports[ctrl->id];
+		if (tmp->vsp_console) {
+			resp.event = VTCON_CONSOLE_PORT;
+			resp.id = ctrl->id;
+			resp.value = 1;
+			pci_vtcon_control_send(sc, &resp, NULL, 0);
+		}
+		break;
+
+	default:
+		/* other control events are not acted upon */
+		break;
+	}
+}
+
+/*
+ * Tell the guest about a port: send VTCON_DEVICE_ADD followed by
+ * VTCON_PORT_NAME carrying the port name (without a terminating NUL).
+ */
+static void
+pci_vtcon_announce_port(struct pci_vtcon_port *port)
+{
+	struct pci_vtcon_softc *con = port->vsp_sc;
+	struct pci_vtcon_control msg;
+
+	msg.id = port->vsp_id;
+	msg.value = 1;
+
+	msg.event = VTCON_DEVICE_ADD;
+	pci_vtcon_control_send(con, &msg, NULL, 0);
+
+	msg.event = VTCON_PORT_NAME;
+	pci_vtcon_control_send(con, &msg, port->vsp_name,
+	    strlen(port->vsp_name));
+}
+
+/*
+ * Send VTCON_PORT_OPEN with value 'open' for the port. While the guest
+ * has not yet signalled VTCON_DEVICE_READY nothing is sent; the port is
+ * only flagged (vsp_open is set to true regardless of 'open') so that
+ * the open is replayed later.
+ */
+static void
+pci_vtcon_open_port(struct pci_vtcon_port *port, bool open)
+{
+	struct pci_vtcon_softc *con = port->vsp_sc;
+	struct pci_vtcon_control msg;
+
+	if (con->vsc_ready) {
+		msg.id = port->vsp_id;
+		msg.event = VTCON_PORT_OPEN;
+		msg.value = (int)open;
+		pci_vtcon_control_send(con, &msg, NULL, 0);
+		return;
+	}
+
+	port->vsp_open = true;
+}
+
+/*
+ * Send one control message to the guest: the header 'ctrl' followed by
+ * 'len' bytes of 'payload' (may be NULL).
+ *
+ * The message is silently dropped when the guest has posted no buffer
+ * on the control queue, or when the posted buffer is too small to hold
+ * header and payload. In the latter case the buffer is handed back
+ * unused rather than written past its end.
+ */
+static void
+pci_vtcon_control_send(struct pci_vtcon_softc *sc,
+    struct pci_vtcon_control *ctrl, const void *payload, size_t len)
+{
+	struct vqueue_info *vq;
+	struct iovec iov;
+	uint16_t idx;
+	int n;
+
+	vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true);
+
+	if (!vq_has_descs(vq))
+		return;
+
+	n = vq_getchain(vq, &idx, &iov, 1, NULL);
+
+	assert(n == 1);
+
+	/* The buffer size is chosen by the guest; do not trust it. */
+	if (iov.iov_len < sizeof(struct pci_vtcon_control) + len) {
+		vq_retchain(vq);
+		return;
+	}
+
+	memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control));
+	if (payload != NULL && len > 0)
+		memcpy((uint8_t *)iov.iov_base +
+		    sizeof(struct pci_vtcon_control), payload, len);
+
+	vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len);
+	vq_endchains(vq, 1);
+}
+
+
+/*
+ * Queue-notify handler for the odd-numbered queues: pass every buffer
+ * the guest queued to the owning port's callback and return it.
+ *
+ * Queues of ports that were never registered have no callback (the
+ * softc is allocated zeroed); their data is dropped instead of calling
+ * through a NULL pointer.
+ */
+static void
+pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtcon_softc *sc;
+	struct pci_vtcon_port *port;
+	struct iovec iov[1];
+	uint16_t idx;
+	int n;
+	uint16_t flags[8];
+
+	sc = vsc;
+	port = pci_vtcon_vq_to_port(sc, vq);
+
+	while (vq_has_descs(vq)) {
+		/* 'n' is signed so the assert also catches a negative return */
+		n = vq_getchain(vq, &idx, iov, 1, flags);
+		assert(n >= 1);
+		if (port != NULL && port->vsp_cb != NULL)
+			port->vsp_cb(port, port->vsp_arg, iov, 1);
+
+		/*
+		 * Release this chain and handle more
+		 */
+		vq_relchain(vq, idx, 0);
+	}
+	vq_endchains(vq, 1);	/* Generate interrupt if appropriate. */
+}
+
+/*
+ * Queue-notify handler for the even-numbered queues. The first
+ * notification marks the owning port as ready to receive and sets
+ * VRING_USED_F_NO_NOTIFY on the queue; later ones do nothing.
+ */
+static void
+pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtcon_softc *con = vsc;
+	struct pci_vtcon_port *owner = pci_vtcon_vq_to_port(con, vq);
+
+	if (owner->vsp_rx_ready)
+		return;
+
+	owner->vsp_rx_ready = true;
+	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+}
+
+/*
+ * Device init for "virtio-console".
+ *
+ * 'opts' is a comma-separated list of name=path pairs; each pair
+ * creates one port backed by a UNIX-domain socket at 'path'. A NULL
+ * 'opts' creates a device without ports.
+ *
+ * Returns 0 on success and 1 on failure (allocation failure, interrupt
+ * setup failure, an option without "=path", or a port that cannot be
+ * created).
+ */
+static int
+pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_vtcon_softc *sc;
+	char *portname = NULL;
+	char *portpath = NULL;
+	char *opt;
+	int i;
+
+	sc = calloc(1, sizeof(struct pci_vtcon_softc));
+	if (sc == NULL)
+		return (1);
+
+	sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config));
+	if (sc->vsc_config == NULL) {
+		free(sc);
+		return (1);
+	}
+	sc->vsc_config->max_nr_ports = VTCON_MAXPORTS;
+	sc->vsc_config->cols = 80;
+	sc->vsc_config->rows = 25;
+
+	/* The virtio layer is given this mutex below; initialise it first. */
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues);
+	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
+	/* even queues get the rx handler, odd queues the tx handler */
+	for (i = 0; i < VTCON_MAXQ; i++) {
+		sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ;
+		sc->vsc_queues[i].vq_notify = i % 2 == 0
+		    ? pci_vtcon_notify_rx
+		    : pci_vtcon_notify_tx;
+	}
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
+		return (1);
+	vi_set_io_bar(&sc->vsc_vs, 0);
+
+	/* create control port */
+	sc->vsc_control_port.vsp_sc = sc;
+	sc->vsc_control_port.vsp_txq = 2;
+	sc->vsc_control_port.vsp_rxq = 3;
+	sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx;
+	sc->vsc_control_port.vsp_enabled = true;
+
+	while ((opt = strsep(&opts, ",")) != NULL) {
+		portname = strsep(&opt, "=");
+		portpath = opt;
+
+		/* strsep() leaves 'opt' NULL when there was no '=' */
+		if (portpath == NULL) {
+			fprintf(stderr, "cannot create port %s: "
+			    "missing socket path (expected name=path)\n",
+			    portname);
+			return (1);
+		}
+
+		/* create port */
+		if (pci_vtcon_sock_add(sc, portname, portpath) < 0) {
+			fprintf(stderr, "cannot create port %s: %s\n",
+			    portname, strerror(errno));
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+struct pci_devemu pci_de_vcon = { /* emulation descriptor for the "virtio-console" PCI device */
+ .pe_emu = "virtio-console",
+ .pe_init = pci_vtcon_init, /* device setup */
+ .pe_barwrite = vi_pci_write, /* BAR accesses are handled by the virtio layer */
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vcon);
diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000000..74efbcaee1
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_net.c
@@ -0,0 +1,1169 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <machine/atomic.h>
+#include <net/ethernet.h>
+#ifdef __FreeBSD__
+#ifndef NETMAP_WITH_LIBS
+#define NETMAP_WITH_LIBS
+#endif
+#include <net/netmap_user.h>
+#endif
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <sysexits.h>
+#ifndef __FreeBSD__
+#include <poll.h>
+#include <libdlpi.h>
+#endif
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "virtio.h"
+
+#define VTNET_RINGSZ 1024
+
+#define VTNET_MAXSEGS 256
+
+/*
+ * Host capabilities. Note that we only offer a few of these.
+ */
+#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+ (1 << 21) /* guest can send gratuitous pkts */
+
+#define VTNET_S_HOSTCAPS \
+ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
+ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
+
+/*
+ * PCI config-space "registers"
+ *
+ * Device-specific configuration area served by pci_vtnet_cfgread() and
+ * pci_vtnet_cfgwrite().  The structure is packed, so the MAC occupies
+ * byte offsets 0-5 and the status word follows directly at offset 6.
+ */
+struct virtio_net_config {
+ uint8_t mac[6]; /* MAC address; the guest driver may overwrite it */
+ uint16_t status; /* set non-zero by pci_vtnet_init() when the link is up */
+} __packed;
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2 /* NB: not yet supported */
+
+#define VTNET_MAXQ 3
+
+/*
+ * Header placed in front of every received frame in the guest's rx
+ * buffer.  The receive paths zero the whole header and, when mergeable
+ * rx buffers are in use, set vrh_bufs to 1.  Without mergeable buffers
+ * the header is 2 bytes shorter, i.e. the trailing vrh_bufs field is
+ * not part of it (see pci_vtnet_neg_features()).
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs; /* number of merged buffers; always 1 here */
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ *
+ * One instance is allocated per emulated virtio-net device by
+ * pci_vtnet_init().  The backend is either a tap device (FreeBSD), a
+ * netmap/vale port (FreeBSD) or a DLPI link (other platforms); the
+ * pci_vtnet_rx/pci_vtnet_tx hooks select the matching implementation.
+ */
+struct pci_vtnet_softc {
+	struct virtio_softc vsc_vs;
+	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];	/* rx and tx only */
+	pthread_mutex_t vsc_mtx;
+	struct mevent	*vsc_mevp;
+
+	/*
+	 * The guard must be __FreeBSD__ (with trailing underscores): that
+	 * is the macro every other conditional in this file tests, and the
+	 * one under which vsc_tapfd is used.
+	 */
+#ifdef __FreeBSD__
+	int		vsc_tapfd;	/* tap descriptor, -1 when unusable */
+#else
+	dlpi_handle_t	vsc_dhp;	/* DLPI link handle */
+	int		vsc_dlpifd;	/* descriptor behind vsc_dhp */
+#endif
+	struct nm_desc	*vsc_nmd;	/* netmap descriptor, NULL if unused */
+
+	int		vsc_rx_ready;	/* guest has kicked the rx queue */
+	volatile int	resetting;	/* set and checked outside lock */
+
+	uint64_t	vsc_features;	/* negotiated features */
+
+	struct virtio_net_config vsc_config;
+
+	pthread_mutex_t	rx_mtx;
+	int		rx_in_progress;
+	int		rx_vhdrlen;	/* size of the rx header in use */
+	int		rx_merge;	/* merged rx bufs in use */
+
+	pthread_t	tx_tid;
+	pthread_mutex_t	tx_mtx;
+	pthread_cond_t	tx_cond;
+	int		tx_in_progress;
+
+	/* backend-specific receive and transmit routines */
+	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
+	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
+	    int iovcnt, int len);
+};
+
+/* Forward declarations for the callbacks referenced by vtnet_vi_consts. */
+static void pci_vtnet_reset(void *);
+/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
+static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
+static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
+static void pci_vtnet_neg_features(void *, uint64_t);
+
+/*
+ * Constants handed to the common virtio layer through vi_softc_linkup().
+ * Only the rx and tx queues are advertised (VTNET_MAXQ - 1); queue
+ * notifications are delivered through the per-queue vq_notify hooks
+ * installed in pci_vtnet_init(), so the device-wide hook is NULL.
+ */
+static struct virtio_consts vtnet_vi_consts = {
+ "vtnet", /* our name */
+ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */
+ sizeof(struct virtio_net_config), /* config reg size */
+ pci_vtnet_reset, /* reset */
+ NULL, /* device-wide qnotify -- not used */
+ pci_vtnet_cfgread, /* read PCI config */
+ pci_vtnet_cfgwrite, /* write PCI config */
+ pci_vtnet_neg_features, /* apply negotiated features */
+ VTNET_S_HOSTCAPS, /* our capabilities */
+};
+
+/*
+ * Block the caller until the transmit thread has gone idle.  The
+ * tx_in_progress flag is sampled under tx_mtx; while it is set the
+ * lock is dropped and the caller naps for 10ms before looking again.
+ */
+static void
+pci_vtnet_txwait(struct pci_vtnet_softc *sc)
+{
+	int busy;
+
+	for (;;) {
+		pthread_mutex_lock(&sc->tx_mtx);
+		busy = sc->tx_in_progress;
+		pthread_mutex_unlock(&sc->tx_mtx);
+
+		if (!busy)
+			break;
+		usleep(10000);
+	}
+}
+
+/*
+ * Block the caller until no receive pass is running.  rx_in_progress
+ * is read with rx_mtx held; if a pass is under way the lock is
+ * released and the check is retried after a 10ms sleep.
+ */
+static void
+pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
+{
+	int busy;
+
+	do {
+		pthread_mutex_lock(&sc->rx_mtx);
+		busy = sc->rx_in_progress;
+		pthread_mutex_unlock(&sc->rx_mtx);
+
+		if (busy)
+			usleep(10000);
+	} while (busy);
+}
+
+/*
+ * virtio reset callback.  Raises the resetting flag, waits for the tx
+ * and rx paths to drain, puts the receive state back to its defaults
+ * (rx not ready, merged buffers, full-size header) and finally lets the
+ * common virtio code reset the rings and negotiated state.
+ */
+static void
+pci_vtnet_reset(void *vsc)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+
+	DPRINTF(("vtnet: device reset requested !\n"));
+
+	/* From here on the rx and tx paths refuse to start new work. */
+	vtnet->resetting = 1;
+
+	/* Let any pass that is already running finish. */
+	pci_vtnet_txwait(vtnet);
+	pci_vtnet_rxwait(vtnet);
+
+	/* Receive-side defaults, as established by pci_vtnet_init(). */
+	vtnet->vsc_rx_ready = 0;
+	vtnet->rx_merge = 1;
+	vtnet->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+
+	/* Rings, MSI-X vectors and negotiated capabilities go last. */
+	vi_reset_dev(&vtnet->vsc_vs);
+
+	vtnet->resetting = 0;
+}
+
+/*
+ * Called to send a buffer chain out to the backend: the tap device on
+ * FreeBSD, the DLPI link on other platforms.
+ *
+ * sc     - device state
+ * iov    - packet segments (the virtio header has already been dropped)
+ * iovcnt - number of valid entries in iov
+ * len    - total packet length in bytes
+ *
+ * Transmission is best effort: send errors are ignored.
+ */
+#ifdef __FreeBSD__
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+    int len)
+{
+	static char pad[60]; /* all zero bytes */
+
+	if (sc->vsc_tapfd == -1)
+		return;
+
+	/*
+	 * If the length is < 60, pad out to that and add the
+	 * extra zero'd segment to the iov. It is guaranteed that
+	 * there is always an extra iov available by the caller.
+	 */
+	if (len < 60) {
+		iov[iovcnt].iov_base = pad;
+		iov[iovcnt].iov_len = 60 - len;
+		iovcnt++;
+	}
+	(void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+#else
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+    int len)
+{
+	int i;
+
+	/*
+	 * No destination address is supplied, so the address length
+	 * argument is 0 (it is a size_t, not a pointer).
+	 *
+	 * NOTE(review): every segment is passed to dlpi_send() on its
+	 * own.  If dlpi_send() transmits one frame per call, a packet
+	 * that spans several segments leaves as several frames --
+	 * confirm, and coalesce the chain first if so.
+	 */
+	for (i = 0; i < iovcnt; i++) {
+		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
+		    iov[i].iov_base, iov[i].iov_len, NULL);
+	}
+}
+#endif /* __FreeBSD__ */
+
+#ifdef __FreeBSD__
+/*
+ * Scratch buffer the receive handlers read into when an incoming frame
+ * has to be thrown away (rx ring not set up, device resetting, or no
+ * rx descriptors available).
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+#endif /* __FreeBSD__ */
+
+/*
+ * Strip the first tlen bytes (the rx header) off the front of an iovec
+ * array and return the iovec at which the payload area begins.
+ *
+ * If the header consumes the whole first segment, that segment is
+ * dropped: *niov is decremented and &iov[1] is returned.  Otherwise
+ * the first segment is advanced in place and &iov[0] is returned.
+ * The header must fit entirely inside the first segment.
+ */
+static __inline struct iovec *
+rx_iov_trim(struct iovec *iov, int *niov, int tlen)
+{
+	struct iovec *first = &iov[0];
+
+	/* XXX short-cut: assume first segment is >= tlen */
+	assert(first->iov_len >= tlen);
+	first->iov_len -= tlen;
+
+	if (first->iov_len != 0) {
+		/* payload shares the first segment with the header */
+		first->iov_base = (void *)((uintptr_t)first->iov_base + tlen);
+		return (first);
+	}
+
+	/* header filled the first segment; payload starts in the next */
+	assert(*niov > 1);
+	*niov -= 1;
+	return (first + 1);
+}
+
+/*
+ * Receive handler for the tap (FreeBSD) / DLPI (other platforms)
+ * backend.  Moves frames from the backend into guest rx buffers, one
+ * descriptor chain per frame, until either the backend has nothing more
+ * to deliver or the rx queue runs out of descriptors.
+ *
+ * Each chain starts with the rx header (rx_vhdrlen bytes), which is
+ * zeroed; with mergeable buffers vrh_bufs is set to 1.  The frame data
+ * is read into the space that follows the header.
+ */
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct iovec iov[VTNET_MAXSEGS], *riov;
+ struct vqueue_info *vq;
+ void *vrx;
+ int n;
+#ifdef __FreeBSD__
+ int len;
+#else
+ size_t len;
+ int ret;
+#endif
+ uint16_t idx;
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+#ifdef __FreeBSD__
+ assert(sc->vsc_tapfd != -1);
+#else
+ assert(sc->vsc_dlpifd != -1);
+#endif
+
+ /*
+ * But, will be called when the rx ring hasn't yet
+ * been set up or the guest is resetting the device.
+ */
+ if (!sc->vsc_rx_ready || sc->resetting) {
+#ifdef __FreeBSD__
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+#endif
+ return;
+ }
+
+ /*
+ * Check for available rx buffers
+ */
+ vq = &sc->vsc_queues[VTNET_RXQ];
+ if (!vq_has_descs(vq)) {
+ /*
+ * Drop the packet and try later. Interrupt on
+ * empty, if that's negotiated.
+ */
+#ifdef __FreeBSD__
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+#endif
+ vq_endchains(vq, 1);
+ return;
+ }
+
+ do {
+ /*
+ * Get descriptor chain
+ */
+ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
+ assert(n >= 1 && n <= VTNET_MAXSEGS);
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = iov[0].iov_base;
+ riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+#ifdef __FreeBSD__
+ len = readv(sc->vsc_tapfd, riov, n);
+#else
+ /*
+ * Only the first payload segment is offered to dlpi_recv().
+ * Any receive failure is folded into the "no more packets"
+ * case below by forcing errno and a zero length.
+ */
+ len = riov[0].iov_len;
+ ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
+ (uint8_t *)riov[0].iov_base, &len, 0, NULL);
+ if (ret != DLPI_SUCCESS) {
+ errno = EWOULDBLOCK;
+ len = 0;
+ }
+#endif
+ /*
+ * On the non-FreeBSD path len is a size_t, so "len <= 0" can
+ * only be true when len is exactly 0.
+ */
+ if (len <= 0 && errno == EWOULDBLOCK) {
+ /*
+ * No more packets, but still some avail ring
+ * entries. Interrupt if needed/appropriate.
+ */
+ vq_retchain(vq);
+ vq_endchains(vq, 0);
+ return;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers if merged rx bufs were negotiated.
+ */
+ memset(vrx, 0, sc->rx_vhdrlen);
+
+ if (sc->rx_merge) {
+ struct virtio_net_rxhdr *vrxh;
+
+ vrxh = vrx;
+ vrxh->vrh_bufs = 1;
+ }
+
+ /*
+ * Release this chain and handle more chains.
+ */
+ vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+ } while (vq_has_descs(vq));
+
+ /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+ vq_endchains(vq, 1);
+}
+
+#ifdef __FreeBSD__
+/*
+ * Copy the segments of one packet into the next free netmap tx slot and
+ * ask netmap to transmit it.
+ *
+ * The search for a ring for which nm_ring_empty() is false starts at
+ * nmd->cur_tx_ring and wraps from last_tx_ring back to first_tx_ring;
+ * if it comes back to the starting ring nothing is sent.  Copying stops
+ * at the first segment that would push the total past 2048 bytes, so an
+ * oversized packet goes out truncated.
+ *
+ * Returns the number of bytes copied (0 if no slot was available).
+ */
+static __inline int
+pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+{
+ int r, i;
+ int len = 0;
+
+ for (r = nmd->cur_tx_ring; ; ) {
+ struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
+ uint32_t cur, idx;
+ char *buf;
+
+ if (nm_ring_empty(ring)) {
+ r++;
+ if (r > nmd->last_tx_ring)
+ r = nmd->first_tx_ring;
+ if (r == nmd->cur_tx_ring)
+ break;
+ continue;
+ }
+ cur = ring->cur;
+ idx = ring->slot[cur].buf_idx;
+ buf = NETMAP_BUF(ring, idx);
+
+ for (i = 0; i < iovcnt; i++) {
+ if (len + iov[i].iov_len > 2048)
+ break;
+ memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
+ len += iov[i].iov_len;
+ }
+ ring->slot[cur].len = len;
+ /* hand the filled slot over and remember which ring was used */
+ ring->head = ring->cur = nm_ring_next(ring, cur);
+ nmd->cur_tx_ring = r;
+ ioctl(nmd->fd, NIOCTXSYNC, NULL);
+ break;
+ }
+
+ return (len);
+}
+
+/*
+ * Copy the next received netmap packet into the caller's iovec array.
+ *
+ * The search for a ring for which nm_ring_empty() is false starts at
+ * nmd->cur_rx_ring and wraps around; if it comes back to the starting
+ * ring nothing is copied.  The iovec array is modified: the length of
+ * the segment that receives the tail of the packet is shortened to the
+ * bytes actually stored, and every segment after it gets length 0.
+ * Packet data that does not fit into the supplied segments is dropped.
+ *
+ * Returns the number of bytes copied (0 if no packet was available).
+ */
+static __inline int
+pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+{
+ int len = 0;
+ int i = 0;
+ int r;
+
+ for (r = nmd->cur_rx_ring; ; ) {
+ struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
+ uint32_t cur, idx;
+ char *buf;
+ size_t left;
+
+ if (nm_ring_empty(ring)) {
+ r++;
+ if (r > nmd->last_rx_ring)
+ r = nmd->first_rx_ring;
+ if (r == nmd->cur_rx_ring)
+ break;
+ continue;
+ }
+ cur = ring->cur;
+ idx = ring->slot[cur].buf_idx;
+ buf = NETMAP_BUF(ring, idx);
+ left = ring->slot[cur].len;
+
+ for (i = 0; i < iovcnt && left > 0; i++) {
+ if (iov[i].iov_len > left)
+ iov[i].iov_len = left;
+ memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
+ len += iov[i].iov_len;
+ left -= iov[i].iov_len;
+ }
+ /* release the slot and remember which ring was used */
+ ring->head = ring->cur = nm_ring_next(ring, cur);
+ nmd->cur_rx_ring = r;
+ ioctl(nmd->fd, NIOCRXSYNC, NULL);
+ break;
+ }
+ /* segments that received no data are marked empty */
+ for (; i < iovcnt; i++)
+ iov[i].iov_len = 0;
+
+ return (len);
+}
+
+/*
+ * Transmit hook for the netmap/vale backend.  Does nothing when no
+ * netmap descriptor is open.  Packets shorter than 60 bytes are
+ * extended with a zero-filled trailing segment; the caller guarantees
+ * that iov has one spare entry for it.
+ */
+static void
+pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+    int len)
+{
+	static char pad[60]; /* all zero bytes */
+	int nsegs = iovcnt;
+
+	if (sc->vsc_nmd == NULL)
+		return;
+
+	if (len < 60) {
+		/* append the padding as one more segment */
+		iov[nsegs].iov_base = pad;
+		iov[nsegs].iov_len = 60 - len;
+		nsegs++;
+	}
+
+	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, nsegs);
+}
+
+/*
+ * Receive handler for the netmap/vale backend.  Mirrors
+ * pci_vtnet_tap_rx(): for every available rx descriptor chain it pulls
+ * one packet from netmap via pci_vtnet_netmap_readv(), zeroes the rx
+ * header (setting vrh_bufs to 1 with mergeable buffers) and returns the
+ * chain to the guest.  It stops when netmap has no more packets or the
+ * rx queue has no more descriptors.
+ */
+static void
+pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
+{
+ struct iovec iov[VTNET_MAXSEGS], *riov;
+ struct vqueue_info *vq;
+ void *vrx;
+ int len, n;
+ uint16_t idx;
+
+ /*
+ * Should never be called without a valid netmap descriptor
+ */
+ assert(sc->vsc_nmd != NULL);
+
+ /*
+ * But, will be called when the rx ring hasn't yet
+ * been set up or the guest is resetting the device.
+ */
+ if (!sc->vsc_rx_ready || sc->resetting) {
+ /*
+ * Drop the packet and try later.
+ */
+ (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+ return;
+ }
+
+ /*
+ * Check for available rx buffers
+ */
+ vq = &sc->vsc_queues[VTNET_RXQ];
+ if (!vq_has_descs(vq)) {
+ /*
+ * Drop the packet and try later. Interrupt on
+ * empty, if that's negotiated.
+ */
+ (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+ vq_endchains(vq, 1);
+ return;
+ }
+
+ do {
+ /*
+ * Get descriptor chain.
+ */
+ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
+ assert(n >= 1 && n <= VTNET_MAXSEGS);
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = iov[0].iov_base;
+ riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+ len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
+
+ if (len == 0) {
+ /*
+ * No more packets, but still some avail ring
+ * entries. Interrupt if needed/appropriate.
+ */
+ vq_retchain(vq);
+ vq_endchains(vq, 0);
+ return;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers if merged rx bufs were negotiated.
+ */
+ memset(vrx, 0, sc->rx_vhdrlen);
+
+ if (sc->rx_merge) {
+ struct virtio_net_rxhdr *vrxh;
+
+ vrxh = vrx;
+ vrxh->vrh_bufs = 1;
+ }
+
+ /*
+ * Release this chain and handle more chains.
+ */
+ vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+ } while (vq_has_descs(vq));
+
+ /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+ vq_endchains(vq, 1);
+}
+#endif /* __FreeBSD__ */
+
+#ifdef __FreeBSD__
+/*
+ * mevent callback registered for read events on the backend descriptor.
+ * Runs the backend's receive routine with rx_mtx held and
+ * rx_in_progress raised, so that pci_vtnet_rxwait() can tell when the
+ * pass has finished.  fd and type are unused.
+ */
+static void
+pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->rx_mtx);
+ sc->rx_in_progress = 1;
+ sc->pci_vtnet_rx(sc);
+ sc->rx_in_progress = 0;
+ pthread_mutex_unlock(&sc->rx_mtx);
+
+}
+#else
+/*
+ * Receive thread used when there is no mevent loop: waits in poll() for
+ * the DLPI descriptor to become readable and then runs
+ * pci_vtnet_tap_rx().  The loop never exits; poll() failures other than
+ * EINTR are reported on stderr and the wait is retried.
+ *
+ * NOTE(review): the receive pass runs under vsc_mtx here, whereas
+ * pci_vtnet_rxwait() inspects rx_in_progress under rx_mtx -- confirm
+ * that the two are meant to use different locks.
+ */
+static void *
+pci_vtnet_poll_thread(void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+ pollfd_t pollset;
+
+ pollset.fd = sc->vsc_dlpifd;
+ pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+
+ for (;;) {
+ if (poll(&pollset, 1, -1) < 0) {
+ if (errno == EINTR)
+ continue;
+ fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
+ continue;
+ }
+ pthread_mutex_lock(&sc->vsc_mtx);
+ sc->rx_in_progress = 1;
+ pci_vtnet_tap_rx(sc);
+ sc->rx_in_progress = 0;
+ pthread_mutex_unlock(&sc->vsc_mtx);
+ }
+
+ return (NULL);
+}
+#endif /* __FreeBSD__ */
+
+/*
+ * Notify hook for the rx queue.  The first notification marks the
+ * receive side as ready and sets VRING_USED_F_NO_NOTIFY on the queue;
+ * later notifications are ignored.
+ */
+static void
+pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+
+	/* Already enabled: nothing left to do. */
+	if (vtnet->vsc_rx_ready != 0)
+		return;
+
+	/* A qnotify means that the rx process can now begin. */
+	vtnet->vsc_rx_ready = 1;
+	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+}
+
+/*
+ * Transmit one descriptor chain from the tx queue.  Segment 0 is the
+ * virtio header and is not sent; segments 1..n-1 form the packet and
+ * are handed to the backend's transmit hook.  The chain is then given
+ * back to the guest with the total length (header plus packet).
+ */
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
+{
+	/* one spare entry so the backend can append a padding segment */
+	struct iovec iov[VTNET_MAXSEGS + 1];
+	uint16_t idx;
+	int nsegs, seg;
+	int pktlen, xferlen;
+
+	nsegs = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
+	assert(nsegs >= 1 && nsegs <= VTNET_MAXSEGS);
+
+	/* packet length skips the header; transfer length includes it */
+	pktlen = 0;
+	xferlen = iov[0].iov_len;
+	for (seg = 1; seg < nsegs; seg++) {
+		pktlen += iov[seg].iov_len;
+		xferlen += iov[seg].iov_len;
+	}
+
+	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", pktlen, nsegs));
+	sc->pci_vtnet_tx(sc, &iov[1], nsegs - 1, pktlen);
+
+	/* chain is processed, release it and report the transfer length */
+	vq_relchain(vq, idx, xferlen);
+}
+
+/*
+ * Notify hook for the tx queue.  When the queue holds descriptors,
+ * further guest notifications are suppressed and the tx thread is woken
+ * up, unless it is already busy transmitting.
+ */
+static void
+pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+
+	/* Only act if there are ring entries to process. */
+	if (vq_has_descs(vq)) {
+		pthread_mutex_lock(&vtnet->tx_mtx);
+		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+		/* Signal the tx thread for processing if it is idle. */
+		if (vtnet->tx_in_progress == 0)
+			pthread_cond_signal(&vtnet->tx_cond);
+		pthread_mutex_unlock(&vtnet->tx_mtx);
+	}
+}
+
+/*
+ * Thread which will handle processing of TX desc
+ *
+ * Sleeps on tx_cond until pci_vtnet_ping_txq() signals work, then
+ * drains the tx queue with tx_mtx released and tx_in_progress set.
+ * The thread never exits.
+ */
+static void *
+pci_vtnet_tx_thread(void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+ struct vqueue_info *vq;
+ int error;
+
+ vq = &sc->vsc_queues[VTNET_TXQ];
+
+ /*
+ * Let us wait till the tx queue pointers get initialised &
+ * first tx signaled
+ *
+ * NOTE(review): this first wait has no predicate; a signal sent
+ * before the thread gets here would presumably go unnoticed until
+ * the next notification -- confirm.
+ */
+ pthread_mutex_lock(&sc->tx_mtx);
+ error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+ assert(error == 0);
+
+ for (;;) {
+ /* note - tx mutex is locked here */
+ while (sc->resetting || !vq_has_descs(vq)) {
+ /*
+ * Re-enable guest notifications, then look at the ring
+ * once more after the barrier so that descriptors added
+ * in the meantime are not left waiting.
+ */
+ vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
+ mb();
+ if (!sc->resetting && vq_has_descs(vq))
+ break;
+
+ sc->tx_in_progress = 0;
+ error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+ assert(error == 0);
+ }
+ vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+ sc->tx_in_progress = 1;
+ pthread_mutex_unlock(&sc->tx_mtx);
+
+ do {
+ /*
+ * Run through entries, placing them into
+ * iovecs and sending when an end-of-packet
+ * is found
+ */
+ pci_vtnet_proctx(sc, vq);
+ } while (vq_has_descs(vq));
+
+ /*
+ * Generate an interrupt if needed.
+ */
+ vq_endchains(vq, 1);
+
+ pthread_mutex_lock(&sc->tx_mtx);
+ }
+ return (NULL);
+}
+
+#ifdef __FreeBSD__
+/*
+ * Notify hook for the control queue.  The control queue is not yet
+ * supported (see VTNET_CTLQ), so the notification is only logged when
+ * debugging is enabled.
+ */
+static void
+pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+#endif /* __FreeBSD__ */
+
+#ifdef __FreeBSD__
+/*
+ * Parse a "mac=<address>" option.
+ *
+ * mac_str  - option text; it is split in place at the first '='
+ * mac_addr - receives the ETHER_ADDR_LEN address bytes on success
+ *
+ * Returns 0 when the address was stored, and also when the option has
+ * no '=' or its key is not "mac" (mac_addr is then left untouched).
+ * Returns EINVAL, after printing a message to stderr, when the address
+ * does not parse, is multicast, or is all zeroes.
+ */
+static int
+pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+	char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+	struct ether_addr *ea;
+	char *key;
+
+	key = strsep(&mac_str, "=");
+
+	/* Not a "mac=..." option: nothing to do. */
+	if (mac_str == NULL || strcmp(key, "mac") != 0)
+		return (0);
+
+	ea = ether_aton(mac_str);
+	if (ea != NULL && !ETHER_IS_MULTICAST(ea->octet) &&
+	    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) != 0) {
+		memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
+		return (0);
+	}
+
+	fprintf(stderr, "Invalid MAC %s\n", mac_str);
+	return (EINVAL);
+}
+#endif /* __FreeBSD__ */
+
+/*
+ * Open the host-side backend named by devname and attach it to sc.
+ *
+ * FreeBSD: opens /dev/<devname> as a tap device, makes it non-blocking,
+ * restricts its capabilities (unless built WITHOUT_CAPSICUM) and
+ * registers pci_vtnet_rx_callback() for read events.
+ *
+ * Other platforms: opens devname as a raw DLPI link, copies the link's
+ * MAC address into the device config, binds and enables promiscuous
+ * mode, makes the descriptor non-blocking and starts
+ * pci_vtnet_poll_thread().
+ *
+ * The function reports nothing directly.  On failure the backend
+ * descriptor (vsc_tapfd / vsc_dlpifd) is left at -1, which is what
+ * pci_vtnet_init() inspects to decide whether the link is up.  Every
+ * failure that makes the backend unusable stops the setup immediately,
+ * so later steps never operate on a closed or invalid handle.  Failing
+ * to enable promiscuous mode is only reported.
+ */
+static void
+pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
+{
+	char tbuf[80];
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+#ifndef __FreeBSD__
+	uchar_t physaddr[DLPI_PHYSADDR_MAX];
+	size_t physaddrlen = DLPI_PHYSADDR_MAX;
+	pthread_t poll_tid;
+	int error;
+#endif
+
+	strcpy(tbuf, "/dev/");
+	strlcat(tbuf, devname, sizeof(tbuf));
+
+	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
+	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
+#ifdef __FreeBSD__
+	sc->vsc_tapfd = open(tbuf, O_RDWR);
+	if (sc->vsc_tapfd == -1) {
+		WPRINTF(("open of tap device %s failed\n", tbuf));
+		return;
+	}
+
+	/*
+	 * Set non-blocking and register for read
+	 * notifications with the event loop
+	 */
+	int opt = 1;
+	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+		WPRINTF(("tap device O_NONBLOCK failed\n"));
+		close(sc->vsc_tapfd);
+		sc->vsc_tapfd = -1;
+		/* do not carry on with a closed descriptor */
+		return;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+	    EVF_READ,
+	    pci_vtnet_rx_callback,
+	    sc);
+	if (sc->vsc_mevp == NULL) {
+		WPRINTF(("Could not register event\n"));
+		close(sc->vsc_tapfd);
+		sc->vsc_tapfd = -1;
+	}
+#else
+	/* The backend counts as unusable until every step has succeeded. */
+	sc->vsc_dlpifd = -1;
+
+	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
+		WPRINTF(("open of vnic device %s failed\n", devname));
+		sc->vsc_dhp = NULL;
+		return;
+	}
+
+	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
+	    &physaddrlen) != DLPI_SUCCESS) {
+		WPRINTF(("read MAC address of vnic device %s failed\n",
+		    devname));
+		goto fail;
+	}
+	if (physaddrlen != ETHERADDRL) {
+		/* physaddrlen is a size_t, hence %zu */
+		WPRINTF(("bad MAC address len %zu on vnic device %s\n",
+		    physaddrlen, devname));
+		goto fail;
+	}
+	memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);
+
+	if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
+		WPRINTF(("bind of vnic device %s failed\n", devname));
+		goto fail;
+	}
+
+	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
+		WPRINTF(("enable promiscous mode(physical) of vnic device %s "
+		    "failed\n", devname));
+	}
+	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
+		WPRINTF(("enable promiscous mode(SAP) of vnic device %s "
+		    "failed\n", devname));
+	}
+
+	sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);
+
+	if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
+		WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n",
+		    devname));
+		goto fail;
+	}
+
+	/* pthread_create() needs somewhere to store the new thread id */
+	error = pthread_create(&poll_tid, NULL, pci_vtnet_poll_thread, sc);
+	assert(error == 0);
+	return;
+
+fail:
+	/* Drop the link and make sure nobody uses the stale handle. */
+	dlpi_close(sc->vsc_dhp);
+	sc->vsc_dhp = NULL;
+	sc->vsc_dlpifd = -1;
+#endif
+}
+
+#ifdef __FreeBSD__
+/*
+ * Attach the netmap/vale port ifname as the device backend: install
+ * the netmap rx/tx hooks, open the port and register
+ * pci_vtnet_rx_callback() for read events on its descriptor.  On any
+ * failure a warning is printed and sc->vsc_nmd is left NULL.
+ */
+static void
+pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
+{
+	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
+	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
+
+	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
+	if (sc->vsc_nmd == NULL) {
+		WPRINTF(("open of netmap device %s failed\n", ifname));
+		return;
+	}
+
+	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, EVF_READ,
+	    pci_vtnet_rx_callback, sc);
+	if (sc->vsc_mevp != NULL)
+		return;
+
+	/* Without an event registration the port is of no use. */
+	WPRINTF(("Could not register event\n"));
+	nm_close(sc->vsc_nmd);
+	sc->vsc_nmd = NULL;
+}
+#endif /* __FreeBSD__ */
+
+/*
+ * Device-model init routine for "virtio-net".
+ *
+ * ctx  - VM context (unused here)
+ * pi   - PCI device instance being initialised
+ * opts - "<backend>[,mac=<address>]" or NULL; a backend whose name
+ *        starts with "tap" or "vmnet" goes to pci_vtnet_tap_setup(),
+ *        one starting with "vale" to the netmap setup (FreeBSD only)
+ *
+ * Allocates the softc, links it to the virtio core, opens the backend,
+ * fills in the PCI and virtio config space and starts the tx thread.
+ *
+ * Returns 0 on success and non-zero on failure (allocation failure,
+ * invalid MAC option, interrupt setup failure, or tx thread creation
+ * failure).
+ */
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+#ifdef __FreeBSD__
+	MD5_CTX mdctx;
+	unsigned char digest[16];
+	char nstr[80];
+#endif
+	char tname[MAXCOMLEN + 1];
+	struct pci_vtnet_softc *sc;
+	const char *env_msi;
+	char *devname;
+	char *vtopts;
+#ifdef __FreeBSD__
+	int mac_provided;
+#endif
+	int use_msix;
+
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL) {
+		WPRINTF(("vtnet: softc allocation failed\n"));
+		return (1);
+	}
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
+	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
+	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
+	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
+	/*
+	 * vsc_queues[] holds only VTNET_MAXQ - 1 entries and
+	 * vtnet_vi_consts advertises the same number of queues, so slot
+	 * VTNET_CTLQ does not exist.  The control queue must stay
+	 * disabled until both are grown to match.
+	 */
+#ifdef notyet
+	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
+#endif
+
+	/*
+	 * Use MSI if set by user
+	 */
+	use_msix = 1;
+	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
+		if (strcasecmp(env_msi, "yes") == 0)
+			use_msix = 0;
+	}
+
+	/*
+	 * Attempt to open the tap device and read the MAC address
+	 * if specified.  Every backend starts out as "not open" so that
+	 * the link status computed below reflects what actually happened.
+	 */
+#ifdef __FreeBSD__
+	mac_provided = 0;
+	sc->vsc_tapfd = -1;
+#else
+	sc->vsc_dlpifd = -1;
+#endif
+	sc->vsc_nmd = NULL;
+	if (opts != NULL) {
+#ifdef __FreeBSD__
+		int err;
+#endif
+
+		devname = vtopts = strdup(opts);
+		if (devname == NULL) {
+			WPRINTF(("vtnet: option string allocation failed\n"));
+			return (1);
+		}
+		/* cut the backend name off; vtopts moves to what follows */
+		(void) strsep(&vtopts, ",");
+
+#ifdef __FreeBSD__
+		if (vtopts != NULL) {
+			err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
+			if (err != 0) {
+				free(devname);
+				return (err);
+			}
+			mac_provided = 1;
+		}
+#endif
+
+#ifdef __FreeBSD__
+		if (strncmp(devname, "vale", 4) == 0)
+			pci_vtnet_netmap_setup(sc, devname);
+#endif
+		if (strncmp(devname, "tap", 3) == 0 ||
+		    strncmp(devname, "vmnet", 5) == 0)
+			pci_vtnet_tap_setup(sc, devname);
+
+		free(devname);
+	}
+
+#ifdef __FreeBSD__
+	/*
+	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the PCI slot/func number and dev name
+	 */
+	if (!mac_provided) {
+		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+		    pi->pi_func, vmname);
+
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, nstr, strlen(nstr));
+		MD5Final(digest, &mdctx);
+
+		sc->vsc_config.mac[0] = 0x00;
+		sc->vsc_config.mac[1] = 0xa0;
+		sc->vsc_config.mac[2] = 0x98;
+		sc->vsc_config.mac[3] = digest[0];
+		sc->vsc_config.mac[4] = digest[1];
+		sc->vsc_config.mac[5] = digest[2];
+	}
+#endif
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	/* Link is up if we managed to open tap device or vale port. */
+#ifdef __FreeBSD__
+	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
+#else
+	sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 ||
+#endif
+	    sc->vsc_nmd != NULL);
+
+	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
+	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
+		return (1);
+
+	/* use BAR 0 to map config regs in IO space */
+	vi_set_io_bar(&sc->vsc_vs, 0);
+
+	sc->resetting = 0;
+
+	sc->rx_merge = 1;
+	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+	sc->rx_in_progress = 0;
+	pthread_mutex_init(&sc->rx_mtx, NULL);
+
+	/*
+	 * Initialize the tx mutex and condition variable and spawn the
+	 * TX processing thread.  As of now, only one thread for TX desc
+	 * processing is spawned.
+	 */
+	sc->tx_in_progress = 0;
+	pthread_mutex_init(&sc->tx_mtx, NULL);
+	pthread_cond_init(&sc->tx_cond, NULL);
+	if (pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread,
+	    (void *)sc) != 0) {
+		/* tx_tid is not valid, so it must not be named or used */
+		WPRINTF(("vtnet: could not create tx thread\n"));
+		return (1);
+	}
+	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
+	    pi->pi_func);
+	pthread_set_name_np(sc->tx_tid, tname);
+
+	return (0);
+}
+
+/*
+ * virtio config-space write callback.  Only the six MAC address bytes
+ * are writable; a write that starts inside the MAC must also end
+ * inside it.  Writes at any other offset are ignored.  Always
+ * returns 0.
+ */
+static int
+pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+
+	if (offset >= 6) {
+		/* silently ignore other writes */
+		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+		return (0);
+	}
+
+	/* The driver is allowed to change the MAC address */
+	assert(offset + size <= 6);
+	memcpy(&vtnet->vsc_config.mac[offset], &value, size);
+
+	return (0);
+}
+
+/*
+ * virtio config-space read callback: copies size bytes starting at
+ * byte offset "offset" of the device config structure into *retval.
+ * No range checking is done here.  Always returns 0.
+ */
+static int
+pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtnet_softc *vtnet = vsc;
+	uint8_t *cfg = (uint8_t *)&vtnet->vsc_config;
+
+	memcpy(retval, cfg + offset, size);
+	return (0);
+}
+
+/*
+ * virtio callback invoked with the feature bits accepted by the guest.
+ * Stores the bits and configures the receive header to match:
+ * with VIRTIO_NET_F_MRG_RXBUF the full header is used, without it the
+ * header is 2 bytes shorter.
+ *
+ * Both settings are assigned outright rather than adjusted relative to
+ * their previous values, so calling this more than once without an
+ * intervening reset still produces the correct header length.
+ */
+static void
+pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	sc->vsc_features = negotiated_features;
+
+	if (sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF) {
+		sc->rx_merge = 1;
+		sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+	} else {
+		sc->rx_merge = 0;
+		/* non-merge rx header is 2 bytes shorter */
+		sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
+	}
+}
+
+/*
+ * Device-emulation descriptor for virtio-net: names the model
+ * "virtio-net", points at pci_vtnet_init() and routes BAR accesses to
+ * the common virtio handlers.  PCI_EMUL_SET() presumably publishes the
+ * descriptor to the PCI emulation core -- confirm against pci_emul.h.
+ */
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c
new file mode 100644
index 0000000000..5f470c03a6
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c
@@ -0,0 +1,209 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * virtio entropy device emulation.
+ * Randomness is sourced from /dev/random which does not block
+ * once it has been seeded at bootup.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/linker_set.h>
+#include <sys/uio.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sysexits.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTRND_RINGSZ 64
+
+
+static int pci_vtrnd_debug;
+#define DPRINTF(params) if (pci_vtrnd_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ *
+ * Allocated by pci_vtrnd_init(), one per emulated entropy device.
+ */
+struct pci_vtrnd_softc {
+ struct virtio_softc vrsc_vs; /* common virtio state */
+ struct vqueue_info vrsc_vq; /* the single request queue */
+ pthread_mutex_t vrsc_mtx; /* installed as the virtio softc's vs_mtx */
+ uint64_t vrsc_cfg; /* not referenced in this file */
+ int vrsc_fd; /* open descriptor for /dev/random */
+};
+
+/* Forward declarations for the callbacks referenced by vtrnd_vi_consts. */
+static void pci_vtrnd_reset(void *);
+static void pci_vtrnd_notify(void *, struct vqueue_info *);
+
+/*
+ * Constants handed to the common virtio layer through vi_softc_linkup().
+ * The device has one queue, no device-specific config registers and
+ * offers no feature bits, so the config and feature hooks are NULL.
+ */
+static struct virtio_consts vtrnd_vi_consts = {
+ "vtrnd", /* our name */
+ 1, /* we support 1 virtqueue */
+ 0, /* config reg size */
+ pci_vtrnd_reset, /* reset */
+ pci_vtrnd_notify, /* device-wide qnotify */
+ NULL, /* read virtio config */
+ NULL, /* write virtio config */
+ NULL, /* apply negotiated features */
+ 0, /* our capabilities */
+};
+
+
+/*
+ * virtio reset callback: the entropy device keeps no state of its own,
+ * so the request is passed straight on to the common virtio reset.
+ */
+static void
+pci_vtrnd_reset(void *vsc)
+{
+	struct pci_vtrnd_softc *rnd = vsc;
+
+	DPRINTF(("vtrnd: device reset requested !\n"));
+	vi_reset_dev(&rnd->vrsc_vs);
+}
+
+
+/*
+ * Queue notify callback: fills every buffer the guest has posted with
+ * bytes read from /dev/random and returns it with the number of bytes
+ * actually read.  Only the first segment of each descriptor chain is
+ * used.  If the random device is not open the queue is left untouched.
+ */
+static void
+pci_vtrnd_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtrnd_softc *rnd = vsc;
+	struct iovec seg;
+	uint16_t idx;
+	int nread;
+
+	if (rnd->vrsc_fd < 0) {
+		vq_endchains(vq, 0);
+		return;
+	}
+
+	for (;;) {
+		if (!vq_has_descs(vq))
+			break;
+
+		vq_getchain(vq, &idx, &seg, 1, NULL);
+
+		nread = read(rnd->vrsc_fd, seg.iov_base, seg.iov_len);
+
+		DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", nread));
+
+		/* Catastrophe if unable to read from /dev/random */
+		assert(nread > 0);
+
+		/* Hand the filled buffer back and look for the next one. */
+		vq_relchain(vq, idx, nread);
+	}
+
+	/* Generate interrupt if appropriate. */
+	vq_endchains(vq, 1);
+}
+
+
+/*
+ * Device-model init routine for "virtio-rnd".
+ *
+ * ctx  - VM context (unused here)
+ * pi   - PCI device instance being initialised
+ * opts - device options (unused here)
+ *
+ * Opens /dev/random non-blocking, verifies with a one-byte read that it
+ * delivers data, then allocates the softc, links it to the virtio core
+ * and fills in the PCI config space.
+ *
+ * Returns 0 on success.  Returns 1 if /dev/random yields no data, if
+ * the softc cannot be allocated, or if interrupt setup fails; in each
+ * of those cases the descriptor for /dev/random is closed again.
+ */
+static int
+pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_vtrnd_softc *sc;
+	int fd;
+	int len;
+	uint8_t v;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	/*
+	 * Should always be able to open /dev/random.
+	 */
+	fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
+
+	assert(fd >= 0);
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_READ);
+	if (caph_rights_limit(fd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	/*
+	 * Check that device is seeded and non-blocking.
+	 */
+	len = read(fd, &v, sizeof(v));
+	if (len <= 0) {
+		WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len));
+		close(fd);
+		return (1);
+	}
+
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL) {
+		WPRINTF(("vtrnd: softc allocation failed\n"));
+		close(fd);
+		return (1);
+	}
+
+	/* The mutex is handed to the virtio core, so set it up first. */
+	pthread_mutex_init(&sc->vrsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq);
+	sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx;
+
+	sc->vrsc_vq.vq_qsize = VTRND_RINGSZ;
+
+	/* keep /dev/random opened while emulating */
+	sc->vrsc_fd = fd;
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) {
+		/*
+		 * The softc stays linked to pi, so it is not freed here;
+		 * the descriptor is released and marked unusable instead.
+		 */
+		close(fd);
+		sc->vrsc_fd = -1;
+		return (1);
+	}
+	vi_set_io_bar(&sc->vrsc_vs, 0);
+
+	return (0);
+}
+
+
+/*
+ * Device-emulation descriptor for the virtio entropy device: names the
+ * model "virtio-rnd", points at pci_vtrnd_init() and routes BAR
+ * accesses to the common virtio handlers.  PCI_EMUL_SET() presumably
+ * publishes the descriptor to the PCI emulation core -- confirm against
+ * pci_emul.h.
+ */
+struct pci_devemu pci_de_vrnd = {
+ .pe_emu = "virtio-rnd",
+ .pe_init = pci_vtrnd_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vrnd);
diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c
new file mode 100644
index 0000000000..238f07398b
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c
@@ -0,0 +1,737 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_message.h>
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_backend.h>
+#include <cam/ctl/ctl_ioctl.h>
+#include <cam/ctl/ctl_util.h>
+#include <cam/ctl/ctl_scsi_all.h>
+#include <camlib.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+#include "iov.h"
+
+/* Ring size used for every virtqueue of the device. */
+#define VTSCSI_RINGSZ 64
+/* Number of request queues, and worker threads started per request queue. */
+#define VTSCSI_REQUESTQ 1
+#define VTSCSI_THR_PER_Q 16
+/* Queue 0 is the control queue, queue 1 the event queue, 2.. are requests. */
+#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2)
+/* Upper bound on descriptors gathered from a single chain. */
+#define VTSCSI_MAXSEG 64
+
+/*
+ * Size of the fixed header at the start of the device-readable
+ * (IN_HEADER) and device-writable (OUT_HEADER) parts of a request.  The
+ * argument is parenthesized so that any pointer expression may be passed.
+ */
+#define VTSCSI_IN_HEADER_LEN(_sc) \
+	(sizeof(struct pci_vtscsi_req_cmd_rd) + (_sc)->vss_config.cdb_size)
+
+#define VTSCSI_OUT_HEADER_LEN(_sc) \
+	(sizeof(struct pci_vtscsi_req_cmd_wr) + (_sc)->vss_config.sense_size)
+
+#define VIRTIO_SCSI_MAX_CHANNEL 0
+#define VIRTIO_SCSI_MAX_TARGET 0
+#define VIRTIO_SCSI_MAX_LUN 16383
+
+#define VIRTIO_SCSI_F_INOUT (1 << 0)
+#define VIRTIO_SCSI_F_HOTPLUG (1 << 1)
+#define VIRTIO_SCSI_F_CHANGE (1 << 2)
+
+/*
+ * Debug output is gated on pci_vtscsi_debug; warnings always print.  Both
+ * take a parenthesized printf argument list: DPRINTF(("x=%d\n", x)).
+ * DPRINTF is wrapped in do/while so it behaves as one statement and cannot
+ * capture an "else" that follows it.
+ */
+static int pci_vtscsi_debug = 0;
+#define DPRINTF(params) do { if (pci_vtscsi_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+/*
+ * Device-specific configuration block.  pci_vtscsi_reset() fills it with
+ * the defaults and pci_vtscsi_cfgread() copies bytes straight out of it, so
+ * the packed layout is what the guest sees.  sense_size and cdb_size also
+ * determine the request header lengths (VTSCSI_*_HEADER_LEN).
+ */
+struct pci_vtscsi_config {
+ uint32_t num_queues;
+ uint32_t seg_max;
+ uint32_t max_sectors;
+ uint32_t cmd_per_lun;
+ uint32_t event_info_size;
+ uint32_t sense_size;
+ uint32_t cdb_size;
+ uint16_t max_channel;
+ uint16_t max_target;
+ uint32_t max_lun;
+} __attribute__((packed));
+
+/*
+ * State of one request queue: the list of pending requests and the worker
+ * threads that service it.
+ */
+struct pci_vtscsi_queue {
+ struct pci_vtscsi_softc * vsq_sc;
+ struct vqueue_info * vsq_vq;
+ /* vsq_mtx is held while vsq_requests is examined or modified */
+ pthread_mutex_t vsq_mtx;
+ /* vsq_qmtx is held by workers around vq_relchain()/vq_endchains() */
+ pthread_mutex_t vsq_qmtx;
+ /* signalled by pci_vtscsi_requestq_notify() after queueing a request */
+ pthread_cond_t vsq_cv;
+ STAILQ_HEAD(, pci_vtscsi_request) vsq_requests;
+ LIST_HEAD(, pci_vtscsi_worker) vsq_workers;
+};
+
+/*
+ * One worker thread of a request queue (see pci_vtscsi_proc()).
+ * vsw_exiting is tested by the worker loop to decide when to stop;
+ * vsw_link chains the worker onto its queue's vsq_workers list.
+ */
+struct pci_vtscsi_worker {
+ struct pci_vtscsi_queue * vsw_queue;
+ pthread_t vsw_thread;
+ bool vsw_exiting;
+ LIST_ENTRY(pci_vtscsi_worker) vsw_link;
+};
+
+/*
+ * A descriptor chain taken off a request queue, waiting for a worker.
+ * pci_vtscsi_requestq_notify() splits the chain at the first descriptor
+ * flagged VRING_DESC_F_WRITE: the leading descriptors go to vsr_iov_in and
+ * the rest to vsr_iov_out.  vsr_idx is the chain index later handed to
+ * vq_relchain().
+ */
+struct pci_vtscsi_request {
+ struct pci_vtscsi_queue * vsr_queue;
+ struct iovec vsr_iov_in[VTSCSI_MAXSEG];
+ int vsr_niov_in;
+ struct iovec vsr_iov_out[VTSCSI_MAXSEG];
+ int vsr_niov_out;
+ uint32_t vsr_idx;
+ STAILQ_ENTRY(pci_vtscsi_request) vsr_link;
+};
+
+/*
+ * Per-device softc
+ *
+ * vss_vq[0] is the control queue, vss_vq[1] the event queue and the
+ * remaining entries are request queues, each paired with the
+ * vss_queues[] element of index (queue number - 2).  vss_iid comes from
+ * the "iid" option and vss_ctl_fd is the descriptor opened on the CTL
+ * device node in pci_vtscsi_init().
+ */
+struct pci_vtscsi_softc {
+ struct virtio_softc vss_vs;
+ struct vqueue_info vss_vq[VTSCSI_MAXQ];
+ struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ];
+ pthread_mutex_t vss_mtx;
+ int vss_iid;
+ int vss_ctl_fd;
+ uint32_t vss_features;
+ struct pci_vtscsi_config vss_config;
+};
+
+/*
+ * Control queue: task management function (TMF) requests.
+ * VIRTIO_SCSI_T_TMF is the request type; the VIRTIO_SCSI_T_TMF_* values are
+ * the subtypes that pci_vtscsi_tmf_handle() translates into CTL task
+ * actions.
+ */
+#define VIRTIO_SCSI_T_TMF 0
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1
+#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2
+#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3
+#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4
+#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7
+
+/* command-specific response values */
+#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0
+#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10
+#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11
+
+/*
+ * Layout of a TMF control request.  Only the trailing "response" byte is
+ * written back by this device model (pci_vtscsi_tmf_handle() reports one
+ * byte of output).
+ */
+struct pci_vtscsi_ctrl_tmf {
+ uint32_t type;
+ uint32_t subtype;
+ uint8_t lun[8];
+ uint64_t id;
+ uint8_t response;
+} __attribute__((packed));
+
+/*
+ * Control queue: asynchronous notification (AN) query.
+ * VIRTIO_SCSI_T_AN_QUERY is the request type; the VIRTIO_SCSI_EVT_ASYNC_*
+ * values are single-bit event flags.  pci_vtscsi_an_handle() currently
+ * does nothing with these requests.
+ */
+#define VIRTIO_SCSI_T_AN_QUERY 1
+#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
+#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4
+#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8
+#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16
+#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32
+#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64
+
+/* Layout of an AN query control request. */
+struct pci_vtscsi_ctrl_an {
+ uint32_t type;
+ uint8_t lun[8];
+ uint32_t event_requested;
+ uint32_t event_actual;
+ uint8_t response;
+} __attribute__((packed));
+
+/*
+ * Values for the "response" byte of a request.  In this file only
+ * VIRTIO_SCSI_S_OK and VIRTIO_SCSI_S_FAILURE are ever stored
+ * (pci_vtscsi_request_handle()).
+ */
+#define VIRTIO_SCSI_S_OK 0
+#define VIRTIO_SCSI_S_OVERRUN 1
+#define VIRTIO_SCSI_S_ABORTED 2
+#define VIRTIO_SCSI_S_BAD_TARGET 3
+#define VIRTIO_SCSI_S_RESET 4
+#define VIRTIO_SCSI_S_BUSY 5
+#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
+#define VIRTIO_SCSI_S_TARGET_FAILURE 7
+#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
+#define VIRTIO_SCSI_S_FAILURE 9
+#define VIRTIO_SCSI_S_INCORRECT_LUN 12
+
+/*
+ * task_attr values of a request header; pci_vtscsi_request_handle() maps
+ * them to CTL tag types, treating anything unrecognised as SIMPLE.
+ */
+#define VIRTIO_SCSI_S_SIMPLE 0
+#define VIRTIO_SCSI_S_ORDERED 1
+#define VIRTIO_SCSI_S_HEAD 2
+#define VIRTIO_SCSI_S_ACA 3
+
+/*
+ * Event queue entry layout.  Within this file only its size is used, to
+ * fill event_info_size in pci_vtscsi_reset().
+ */
+struct pci_vtscsi_event {
+ uint32_t event;
+ uint8_t lun[8];
+ uint32_t reason;
+} __attribute__((packed));
+
+/*
+ * Header at the start of the device-readable part of a request.  The
+ * flexible cdb[] member is followed on the wire by the data-out payload;
+ * VTSCSI_IN_HEADER_LEN() adds the configured cdb_size to this struct's size.
+ */
+struct pci_vtscsi_req_cmd_rd {
+ uint8_t lun[8];
+ uint64_t id;
+ uint8_t task_attr;
+ uint8_t prio;
+ uint8_t crn;
+ uint8_t cdb[];
+} __attribute__((packed));
+
+/*
+ * Header at the start of the device-writable part of a request, filled in
+ * by pci_vtscsi_request_handle().  VTSCSI_OUT_HEADER_LEN() adds the
+ * configured sense_size for the flexible sense[] member.
+ */
+struct pci_vtscsi_req_cmd_wr {
+ uint32_t sense_len;
+ uint32_t residual;
+ uint16_t status_qualifier;
+ uint8_t status;
+ uint8_t response;
+ uint8_t sense[];
+} __attribute__((packed));
+
+/*
+ * Forward declarations: the virtio_consts table below and the queue setup
+ * code refer to these functions before their definitions.
+ */
+static void *pci_vtscsi_proc(void *);
+static void pci_vtscsi_reset(void *);
+static void pci_vtscsi_neg_features(void *, uint64_t);
+static int pci_vtscsi_cfgread(void *, int, int, uint32_t *);
+static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t);
+static inline int pci_vtscsi_get_lun(uint8_t *);
+static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t);
+static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *,
+ struct pci_vtscsi_ctrl_tmf *);
+static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *,
+ struct pci_vtscsi_ctrl_an *);
+static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *,
+ int, struct iovec *, int);
+static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *);
+static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *);
+static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *);
+static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *,
+ struct pci_vtscsi_queue *, int);
+static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *);
+
+/*
+ * Constants and callbacks handed to the generic virtio layer through
+ * vi_softc_linkup().  The initializer is positional, so the order of the
+ * entries must match struct virtio_consts (declared outside this file).
+ * No device-wide notify callback is given: each queue gets its own
+ * vq_notify handler in pci_vtscsi_init().
+ */
+static struct virtio_consts vtscsi_vi_consts = {
+ "vtscsi", /* our name */
+ VTSCSI_MAXQ, /* we support 2+n virtqueues */
+ sizeof(struct pci_vtscsi_config), /* config reg size */
+ pci_vtscsi_reset, /* reset */
+ NULL, /* device-wide qnotify */
+ pci_vtscsi_cfgread, /* read virtio config */
+ pci_vtscsi_cfgwrite, /* write virtio config */
+ pci_vtscsi_neg_features, /* apply negotiated features */
+ 0, /* our capabilities */
+};
+
+/*
+ * Worker thread body; arg is the thread's struct pci_vtscsi_worker.
+ *
+ * Repeatedly takes the oldest request off the queue's vsq_requests list,
+ * executes it with pci_vtscsi_request_handle() and returns the descriptor
+ * chain to the guest.  Always returns NULL.
+ */
+static void *
+pci_vtscsi_proc(void *arg)
+{
+ struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg;
+ struct pci_vtscsi_queue *q = worker->vsw_queue;
+ struct pci_vtscsi_request *req;
+ int iolen;
+
+ for (;;) {
+ pthread_mutex_lock(&q->vsq_mtx);
+
+ /* Sleep until there is work or this worker is told to exit. */
+ while (STAILQ_EMPTY(&q->vsq_requests)
+ && !worker->vsw_exiting)
+ pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx);
+
+ /*
+ * Leaves the loop with vsq_mtx still held; it is dropped after
+ * the loop.  NOTE(review): nothing in this file sets
+ * vsw_exiting, so this exit is not taken at present.
+ */
+ if (worker->vsw_exiting)
+ break;
+
+ req = STAILQ_FIRST(&q->vsq_requests);
+ STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link);
+
+ /* The request itself is processed without holding vsq_mtx. */
+ pthread_mutex_unlock(&q->vsq_mtx);
+ iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in,
+ req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out);
+
+ /* vsq_qmtx serializes the workers' updates of the virtqueue. */
+ pthread_mutex_lock(&q->vsq_qmtx);
+ vq_relchain(q->vsq_vq, req->vsr_idx, iolen);
+ vq_endchains(q->vsq_vq, 0);
+ pthread_mutex_unlock(&q->vsq_qmtx);
+
+ DPRINTF(("virtio-scsi: request <idx=%d> completed\n",
+ req->vsr_idx));
+ free(req);
+ }
+
+ pthread_mutex_unlock(&q->vsq_mtx);
+ return (NULL);
+}
+
+/*
+ * Reset callback: resets the generic virtio state and then reloads the
+ * device configuration block with its default values.
+ */
+static void
+pci_vtscsi_reset(void *vsc)
+{
+	struct pci_vtscsi_softc *sc = vsc;
+	const struct pci_vtscsi_config defaults = {
+		.num_queues = VTSCSI_REQUESTQ,
+		.seg_max = VTSCSI_MAXSEG,
+		.max_sectors = 2,
+		.cmd_per_lun = 1,
+		.event_info_size = sizeof(struct pci_vtscsi_event),
+		.sense_size = 96,
+		.cdb_size = 32,
+		.max_channel = VIRTIO_SCSI_MAX_CHANNEL,
+		.max_target = VIRTIO_SCSI_MAX_TARGET,
+		.max_lun = VIRTIO_SCSI_MAX_LUN
+	};
+
+	DPRINTF(("vtscsi: device reset requested\n"));
+	vi_reset_dev(&sc->vss_vs);
+
+	/* (re)load the configuration defaults */
+	sc->vss_config = defaults;
+}
+
+/*
+ * Feature negotiation callback: remembers the negotiated feature bits in
+ * the softc (stored in the 32-bit vss_features field).
+ */
+static void
+pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features)
+{
+	struct pci_vtscsi_softc *sc;
+
+	sc = vsc;
+	sc->vss_features = negotiated_features;
+}
+
+/*
+ * Config read callback: copies "size" bytes found at "offset" within the
+ * device configuration block into *retval.  Always returns 0.
+ */
+static int
+pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtscsi_softc *sc = vsc;
+	uint8_t *cfg_bytes = (uint8_t *)&sc->vss_config;
+
+	memcpy(retval, cfg_bytes + offset, size);
+	return (0);
+}
+
+/*
+ * Config write callback: guest writes to the configuration block are
+ * accepted and discarded.  Always returns 0.
+ */
+static int
+pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val)
+{
+	(void)vsc;
+	(void)offset;
+	(void)size;
+	(void)val;
+
+	return (0);
+}
+
+/*
+ * Extract the LUN number from an 8-byte virtio-scsi LUN field: the low 14
+ * bits of the big-endian 16-bit value held in bytes 2 and 3.
+ */
+static inline int
+pci_vtscsi_get_lun(uint8_t *lun)
+{
+	const int hi = lun[2] << 8;
+	const int lo = lun[3];
+
+	return ((hi | lo) & 0x3fff);
+}
+
+/*
+ * Dispatch one control queue request held in the "bufsize"-byte buffer
+ * "buf" according to its leading 32-bit type field.
+ *
+ * The buffer holds guest-supplied data, so its length is validated before
+ * it is interpreted as a TMF or AN structure; short or unknown requests are
+ * ignored.  Returns the number of bytes at the end of the buffer that the
+ * handler filled in as a response (0 when nothing was written).
+ */
+static int
+pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf,
+    size_t bufsize)
+{
+	uint32_t type;
+
+	if (buf == NULL || bufsize < sizeof(type)) {
+		WPRINTF(("virtio-scsi: ignoring truncated control request\n"));
+		return (0);
+	}
+
+	/* memcpy: no assumption is made about the buffer's alignment. */
+	memcpy(&type, buf, sizeof(type));
+
+	switch (type) {
+	case VIRTIO_SCSI_T_TMF:
+		if (bufsize < sizeof(struct pci_vtscsi_ctrl_tmf)) {
+			WPRINTF(("virtio-scsi: ignoring short tmf request "
+			    "(%zu bytes)\n", bufsize));
+			return (0);
+		}
+		return (pci_vtscsi_tmf_handle(sc,
+		    (struct pci_vtscsi_ctrl_tmf *)buf));
+
+	case VIRTIO_SCSI_T_AN_QUERY:
+		if (bufsize < sizeof(struct pci_vtscsi_ctrl_an)) {
+			WPRINTF(("virtio-scsi: ignoring short an request "
+			    "(%zu bytes)\n", bufsize));
+			return (0);
+		}
+		return (pci_vtscsi_an_handle(sc,
+		    (struct pci_vtscsi_ctrl_an *)buf));
+
+	default:
+		return (0);
+	}
+}
+
+/*
+ * Execute a task management request by translating it into a CTL task I/O
+ * and submitting it with the CTL_IO ioctl on the device's CTL descriptor.
+ *
+ * The CTL task status is stored in tmf->response.  Returns 1, the number
+ * of response bytes the caller should copy back to the guest.
+ */
+static int
+pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc,
+ struct pci_vtscsi_ctrl_tmf *tmf)
+{
+ union ctl_io *io;
+ int err;
+
+ /* NOTE(review): the result of ctl_scsi_alloc_io() is not checked. */
+ io = ctl_scsi_alloc_io(sc->vss_iid);
+ ctl_scsi_zero_io(io);
+
+ io->io_hdr.io_type = CTL_IO_TASK;
+ io->io_hdr.nexus.initid = sc->vss_iid;
+ io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun);
+ io->taskio.tag_type = CTL_TAG_SIMPLE;
+ io->taskio.tag_num = (uint32_t)tmf->id;
+
+ /*
+ * Map the virtio subtype onto the CTL task action.  There is no
+ * default case: an unknown subtype leaves task_action as
+ * ctl_scsi_zero_io() set it.
+ */
+ switch (tmf->subtype) {
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK:
+ io->taskio.task_action = CTL_TASK_ABORT_TASK;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
+ io->taskio.task_action = CTL_TASK_ABORT_TASK_SET;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
+ io->taskio.task_action = CTL_TASK_CLEAR_ACA;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
+ io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
+ io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
+ io->taskio.task_action = CTL_TASK_LUN_RESET;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_QUERY_TASK:
+ io->taskio.task_action = CTL_TASK_QUERY_TASK;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
+ io->taskio.task_action = CTL_TASK_QUERY_TASK_SET;
+ break;
+ }
+
+ /* With debugging enabled, print a textual dump of the I/O. */
+ if (pci_vtscsi_debug) {
+ struct sbuf *sb = sbuf_new_auto();
+ ctl_io_sbuf(io, sb);
+ sbuf_finish(sb);
+ DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
+ sbuf_delete(sb);
+ }
+
+ /*
+ * A failed ioctl is only logged; the response below is still taken
+ * from the I/O structure.
+ */
+ err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
+ if (err != 0)
+ WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
+
+ tmf->response = io->taskio.task_status;
+ ctl_scsi_free_io(io);
+ return (1);
+}
+
+/*
+ * Asynchronous notification queries are not acted upon: nothing is written
+ * to the request and 0 response bytes are reported.
+ */
+static int
+pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc,
+    struct pci_vtscsi_ctrl_an *an)
+{
+	(void)sc;
+	(void)an;
+
+	return (0);
+}
+
+/*
+ * Execute one SCSI request through CTL.
+ *
+ * iov_in/niov_in describe the device-readable part of the chain (request
+ * header, CDB and any data-out payload); iov_out/niov_out describe the
+ * device-writable part (response header, sense area and any data-in
+ * buffer).  The payload segments beyond the headers are handed to CTL as
+ * an external scatter/gather list, and the response header is copied back
+ * to the guest afterwards.
+ *
+ * Returns the number of bytes written for the guest: the response header
+ * length plus whatever CTL reports in ext_data_filled, or 0 when the
+ * request could not be set up at all.
+ */
+static int
+pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in,
+    int niov_in, struct iovec *iov_out, int niov_out)
+{
+	struct pci_vtscsi_softc *sc = q->vsq_sc;
+	struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL;
+	struct pci_vtscsi_req_cmd_wr *cmd_wr;
+	struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG];
+	union ctl_io *io;
+	int data_niov_in, data_niov_out;
+	void *ext_data_ptr = NULL;
+	uint32_t ext_data_len = 0, ext_sg_entries = 0;
+	int err, nxferred;
+
+	/* Payload segments are whatever follows the fixed-size headers. */
+	seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in,
+	    VTSCSI_IN_HEADER_LEN(sc));
+	seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out,
+	    VTSCSI_OUT_HEADER_LEN(sc));
+
+	/* Cut both lists down to just their headers. */
+	truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc));
+	truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc));
+	iov_to_buf(iov_in, niov_in, (void **)&cmd_rd);
+
+	/* Never dereference a request header that was not produced. */
+	if (cmd_rd == NULL) {
+		WPRINTF(("virtio-scsi: unable to read request header\n"));
+		return (0);
+	}
+
+	/*
+	 * Zero-filled, so that fields not set below (in particular on the
+	 * ioctl failure path) do not expose stale heap contents to the guest.
+	 */
+	cmd_wr = calloc(1, VTSCSI_OUT_HEADER_LEN(sc));
+	if (cmd_wr == NULL) {
+		WPRINTF(("virtio-scsi: unable to allocate response header\n"));
+		free(cmd_rd);
+		return (0);
+	}
+
+	io = ctl_scsi_alloc_io(sc->vss_iid);
+	ctl_scsi_zero_io(io);
+
+	io->io_hdr.nexus.initid = sc->vss_iid;
+	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun);
+
+	io->io_hdr.io_type = CTL_IO_SCSI;
+
+	/* A request carries data in at most one direction. */
+	if (data_niov_in > 0) {
+		ext_data_ptr = (void *)data_iov_in;
+		ext_sg_entries = data_niov_in;
+		ext_data_len = count_iov(data_iov_in, data_niov_in);
+		io->io_hdr.flags |= CTL_FLAG_DATA_OUT;
+	} else if (data_niov_out > 0) {
+		ext_data_ptr = (void *)data_iov_out;
+		ext_sg_entries = data_niov_out;
+		ext_data_len = count_iov(data_iov_out, data_niov_out);
+		io->io_hdr.flags |= CTL_FLAG_DATA_IN;
+	}
+
+	io->scsiio.sense_len = sc->vss_config.sense_size;
+	io->scsiio.tag_num = (uint32_t)cmd_rd->id;
+	switch (cmd_rd->task_attr) {
+	case VIRTIO_SCSI_S_ORDERED:
+		io->scsiio.tag_type = CTL_TAG_ORDERED;
+		break;
+	case VIRTIO_SCSI_S_HEAD:
+		io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE;
+		break;
+	case VIRTIO_SCSI_S_ACA:
+		io->scsiio.tag_type = CTL_TAG_ACA;
+		break;
+	case VIRTIO_SCSI_S_SIMPLE:
+	default:
+		io->scsiio.tag_type = CTL_TAG_SIMPLE;
+		break;
+	}
+	io->scsiio.ext_sg_entries = ext_sg_entries;
+	io->scsiio.ext_data_ptr = ext_data_ptr;
+	io->scsiio.ext_data_len = ext_data_len;
+	io->scsiio.ext_data_filled = 0;
+	io->scsiio.cdb_len = sc->vss_config.cdb_size;
+	memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size);
+
+	if (pci_vtscsi_debug) {
+		struct sbuf *sb = sbuf_new_auto();
+		ctl_io_sbuf(io, sb);
+		sbuf_finish(sb);
+		DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
+		sbuf_delete(sb);
+	}
+
+	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
+	if (err != 0) {
+		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
+		cmd_wr->response = VIRTIO_SCSI_S_FAILURE;
+	} else {
+		cmd_wr->sense_len = MIN(io->scsiio.sense_len,
+		    sc->vss_config.sense_size);
+		cmd_wr->residual = io->scsiio.residual;
+		cmd_wr->status = io->scsiio.scsi_status;
+		cmd_wr->response = VIRTIO_SCSI_S_OK;
+		memcpy(&cmd_wr->sense, &io->scsiio.sense_data,
+		    cmd_wr->sense_len);
+	}
+
+	buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0);
+
+	/* Read the transfer count before the I/O structure is released. */
+	nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled;
+
+	free(cmd_rd);
+	free(cmd_wr);
+	ctl_scsi_free_io(io);
+	return (nxferred);
+}
+
+/*
+ * Notify handler of the control queue.  Every available descriptor chain
+ * is flattened into a bounce buffer, dispatched through
+ * pci_vtscsi_control_handle(), and the last "iolen" bytes of the buffer
+ * (the response) are copied back into the chain before it is released.
+ */
+static void
+pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct pci_vtscsi_softc *sc;
+ struct iovec iov[VTSCSI_MAXSEG];
+ uint16_t idx, n;
+ void *buf = NULL;
+ size_t bufsize;
+ int iolen;
+
+ sc = vsc;
+
+ while (vq_has_descs(vq)) {
+ n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL);
+ /*
+ * The same "buf" pointer is passed in on every iteration and
+ * freed once after the loop -- presumably iov_to_buf()
+ * (re)allocates it as needed (TODO confirm in iov.c).
+ */
+ bufsize = iov_to_buf(iov, n, &buf);
+ iolen = pci_vtscsi_control_handle(sc, buf, bufsize);
+ /* Copy the trailing iolen bytes back at the same offset. */
+ buf_to_iov(buf + bufsize - iolen, iolen, iov, n,
+ bufsize - iolen);
+
+ /*
+ * Release this chain and handle more
+ */
+ vq_relchain(vq, idx, iolen);
+ }
+ vq_endchains(vq, 1); /* Generate interrupt if appropriate. */
+ free(buf);
+}
+
+/*
+ * Notify handler of the event queue: the device does nothing with the
+ * buffers and sets VRING_USED_F_NO_NOTIFY in the used ring flags.
+ */
+static void
+pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq)
+{
+	(void)vsc;
+
+	vq->vq_used->vu_flags = vq->vq_used->vu_flags |
+	    VRING_USED_F_NO_NOTIFY;
+}
+
+/*
+ * Notify handler of a request queue.  Each available descriptor chain is
+ * copied into a freshly allocated pci_vtscsi_request, appended to the
+ * queue's pending list, and one worker thread is woken to process it.  The
+ * chain is released later by the worker (pci_vtscsi_proc()).
+ */
+static void
+pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct pci_vtscsi_softc *sc;
+ struct pci_vtscsi_queue *q;
+ struct pci_vtscsi_request *req;
+ struct iovec iov[VTSCSI_MAXSEG];
+ uint16_t flags[VTSCSI_MAXSEG];
+ uint16_t idx, n, i;
+ int readable;
+
+ sc = vsc;
+ /* Request queues start at virtqueue number 2. */
+ q = &sc->vss_queues[vq->vq_num - 2];
+
+ while (vq_has_descs(vq)) {
+ readable = 0;
+ /*
+ * NOTE(review): n is unsigned, so an error value returned by
+ * vq_getchain() could not be recognised here -- confirm its
+ * contract.
+ */
+ n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags);
+
+ /* Count readable descriptors */
+ for (i = 0; i < n; i++) {
+ if (flags[i] & VRING_DESC_F_WRITE)
+ break;
+
+ readable++;
+ }
+
+ /* NOTE(review): the calloc() result is used without a check. */
+ req = calloc(1, sizeof(struct pci_vtscsi_request));
+ req->vsr_idx = idx;
+ req->vsr_queue = q;
+ req->vsr_niov_in = readable;
+ req->vsr_niov_out = n - readable;
+ memcpy(req->vsr_iov_in, iov,
+ req->vsr_niov_in * sizeof(struct iovec));
+ memcpy(req->vsr_iov_out, iov + readable,
+ req->vsr_niov_out * sizeof(struct iovec));
+
+ /* Queue the request and wake one waiting worker. */
+ pthread_mutex_lock(&q->vsq_mtx);
+ STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link);
+ pthread_cond_signal(&q->vsq_cv);
+ pthread_mutex_unlock(&q->vsq_mtx);
+
+ DPRINTF(("virtio-scsi: request <idx=%d> enqueued\n", idx));
+ }
+}
+
+/*
+ * Set up request queue number "num": bind it to its virtqueue, initialise
+ * its locks and lists, and start VTSCSI_THR_PER_Q worker threads.
+ *
+ * Returns 0 on success, or 1 if a worker could not be allocated or
+ * started; workers started before the failure keep running.
+ */
+static int
+pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc,
+    struct pci_vtscsi_queue *queue, int num)
+{
+	struct pci_vtscsi_worker *worker;
+	/*
+	 * "virtio-scsi:0-15" is already 16 characters plus the terminator,
+	 * so the buffer must be larger than 16 bytes; snprintf() bounds the
+	 * write in any case.
+	 */
+	char threadname[32];
+	int error, i;
+
+	queue->vsq_sc = sc;
+	queue->vsq_vq = &sc->vss_vq[num + 2];
+
+	pthread_mutex_init(&queue->vsq_mtx, NULL);
+	pthread_mutex_init(&queue->vsq_qmtx, NULL);
+	pthread_cond_init(&queue->vsq_cv, NULL);
+	STAILQ_INIT(&queue->vsq_requests);
+	LIST_INIT(&queue->vsq_workers);
+
+	for (i = 0; i < VTSCSI_THR_PER_Q; i++) {
+		worker = calloc(1, sizeof(*worker));
+		if (worker == NULL) {
+			WPRINTF(("virtio-scsi: unable to allocate worker\n"));
+			return (1);
+		}
+		worker->vsw_queue = queue;
+
+		error = pthread_create(&worker->vsw_thread, NULL,
+		    &pci_vtscsi_proc, (void *)worker);
+		if (error != 0) {
+			WPRINTF(("virtio-scsi: pthread_create: %s\n",
+			    strerror(error)));
+			free(worker);
+			return (1);
+		}
+
+		(void) snprintf(threadname, sizeof(threadname),
+		    "virtio-scsi:%d-%d", num, i);
+		pthread_set_name_np(worker->vsw_thread, threadname);
+		LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link);
+	}
+
+	return (0);
+}
+
+/*
+ * Create a virtio-scsi device instance.
+ *
+ * "opts" is a comma-separated list.  A first element without '=' names the
+ * CTL device node; "dev=<path>" does the same and "iid=<n>" sets the
+ * initiator id.  The node defaults to /dev/cam/ctl.  Any other option is
+ * rejected.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_vtscsi_softc *sc;
+	char *opt, *optname;
+	const char *devname;
+	int i, optidx = 0;
+
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL) {
+		WPRINTF(("virtio-scsi: unable to allocate softc\n"));
+		return (1);
+	}
+
+	devname = "/dev/cam/ctl";
+	while ((opt = strsep(&opts, ",")) != NULL) {
+		/* After this, opt is the value (or NULL when there is no '='). */
+		optname = strsep(&opt, "=");
+		if (opt == NULL && optidx == 0) {
+			if (optname[0] != 0)
+				devname = optname;
+		} else if (strcmp(optname, "dev") == 0 && opt != NULL) {
+			devname = opt;
+		} else if (strcmp(optname, "iid") == 0 && opt != NULL) {
+			sc->vss_iid = strtoul(opt, NULL, 10);
+		} else {
+			fprintf(stderr, "Invalid option %s\n", optname);
+			free(sc);
+			return (1);
+		}
+		optidx++;
+	}
+
+	sc->vss_ctl_fd = open(devname, O_RDWR);
+	if (sc->vss_ctl_fd < 0) {
+		WPRINTF(("cannot open %s: %s\n", devname, strerror(errno)));
+		free(sc);
+		return (1);
+	}
+
+	vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq);
+	sc->vss_vs.vs_mtx = &sc->vss_mtx;
+
+	/* controlq */
+	sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ;
+	sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify;
+
+	/* eventq */
+	sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ;
+	sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify;
+
+	/* request queues */
+	for (i = 2; i < VTSCSI_MAXQ; i++) {
+		sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ;
+		sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify;
+		pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2);
+	}
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	/*
+	 * The softc is deliberately not freed if this fails: the worker
+	 * threads started above already hold pointers into it.
+	 */
+	if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix()))
+		return (1);
+	vi_set_io_bar(&sc->vss_vs, 0);
+
+	return (0);
+}
+
+
+/*
+ * Device-emulation descriptor for the "virtio-scsi" device model.
+ * Instances are set up by pci_vtscsi_init(); BAR reads and writes are
+ * routed to vi_pci_read()/vi_pci_write().  PCI_EMUL_SET is defined outside
+ * this file -- presumably it adds the descriptor to the set of known
+ * device models (TODO confirm).
+ */
+struct pci_devemu pci_de_vscsi = {
+ .pe_emu = "virtio-scsi",
+ .pe_init = pci_vtscsi_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vscsi);
diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c
new file mode 100644
index 0000000000..26c8cdeeba
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_viona.c
@@ -0,0 +1,837 @@
+/*
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/ioctl.h>
+#include <sys/viona_io.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <signal.h>
+#include <poll.h>
+#include <libdladm.h>
+#include <libdllink.h>
+#include <libdlvnic.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+/* Default virtqueue size; the "vqsize" option overrides it. */
+#define VIONA_RINGSZ 1024
+
+/*
+ * PCI config-space register offsets
+ */
+#define VIONA_R_CFG0 24
+#define VIONA_R_CFG1 25
+#define VIONA_R_CFG2 26
+#define VIONA_R_CFG3 27
+#define VIONA_R_CFG4 28
+#define VIONA_R_CFG5 29
+#define VIONA_R_CFG6 30
+#define VIONA_R_CFG7 31
+#define VIONA_R_MAX 31
+
+/* Parenthesized so the macro can be used inside any larger expression. */
+#define VIONA_REGSZ (VIONA_R_MAX + 1)
+
+/*
+ * Queue definitions.
+ */
+#define VIONA_RXQ 0
+#define VIONA_TXQ 1
+#define VIONA_CTLQ 2
+
+#define VIONA_MAXQ 3
+
+/*
+ * Debug printf
+ *
+ * Both macros take a parenthesized printf argument list, e.g.
+ * DPRINTF(("x=%d\n", x)).  DPRINTF is wrapped in do/while so it behaves as
+ * one statement and cannot capture an "else" that follows it.
+ */
+static volatile int pci_viona_debug;
+#define DPRINTF(params) do { if (pci_viona_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ *
+ * vsc_mtx is taken around updates of vsc_isr and of the MSI-X state.
+ * vsc_vnafd is the descriptor opened on /dev/viona, and vsc_linkid the
+ * datalink id resolved from vsc_linkname.
+ */
+struct pci_viona_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+
+ datalink_id_t vsc_linkid;
+ int vsc_vnafd;
+
+ /* Configurable parameters */
+ char vsc_linkname[MAXLINKNAMELEN];
+ uint32_t vsc_feature_mask;
+ uint16_t vsc_vq_size;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ /* per-ring state, indexed by queue number */
+ uint64_t vsc_pfn[VIONA_MAXQ];
+ uint16_t vsc_msix_table_idx[VIONA_MAXQ];
+ /* MSI-X enabled and not function-masked; see pci_viona_lintrupdate() */
+ boolean_t vsc_msix_active;
+};
+
+/*
+ * Size of the I/O BAR region covering the virtio header and the
+ * device-specific registers.  Without MSI-X the register block is shorter
+ * by the distance between VTCFG_R_MSIX and VTCFG_R_CFG1.
+ */
+static uint64_t
+pci_viona_iosize(struct pci_devinst *pi)
+{
+	uint64_t size = VIONA_REGSZ;
+
+	if (!pci_msix_enabled(pi))
+		size -= (VTCFG_R_CFG1 - VTCFG_R_MSIX);
+
+	return (size);
+}
+
+/*
+ * Size of queue "qnum" as reported to the guest: the configured ring size
+ * for the data queues and 0 for the control queue.
+ */
+static uint16_t
+pci_viona_qsize(struct pci_viona_softc *sc, int qnum)
+{
+	/* XXX no ctl queue currently */
+	return ((qnum == VIONA_CTLQ) ? 0 : sc->vsc_vq_size);
+}
+
+/*
+ * Reset the RX or TX ring in the viona driver and forget its page frame.
+ * Requests for any other ring are ignored.  The ioctl is retried for as
+ * long as it is interrupted (EINTR); any other failure is logged and leaves
+ * the recorded page frame untouched.
+ */
+static void
+pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
+{
+	assert(ring < VIONA_MAXQ);
+
+	if (ring != VIONA_RXQ && ring != VIONA_TXQ)
+		return;
+
+	while (ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring) != 0) {
+		if (errno != EINTR) {
+			WPRINTF(("ioctl viona ring %d reset failed %d\n",
+			    ring, errno));
+			return;
+		}
+	}
+
+	sc->vsc_pfn[ring] = 0;
+}
+
+/*
+ * Record a new device status value.  Writing 0 is a device reset request:
+ * the RX and TX rings are reset before the status is stored.
+ */
+static void
+pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value)
+{
+	if (value != 0) {
+		sc->vsc_status = value;
+		return;
+	}
+
+	DPRINTF(("viona: device reset requested !\n"));
+	pci_viona_ring_reset(sc, VIONA_RXQ);
+	pci_viona_ring_reset(sc, VIONA_TXQ);
+	sc->vsc_status = 0;
+}
+
+/*
+ * Interrupt poll thread; param is the device softc.
+ *
+ * Waits for POLLRDBAND on the viona descriptor, asks the driver which
+ * rings have an interrupt pending, and for each of them either generates
+ * the ring's MSI-X message or, when MSI-X is not enabled, asserts the
+ * legacy interrupt once for the whole pass.  Each serviced ring has its
+ * pending state cleared in the driver.  The thread ends only when poll()
+ * fails with an error other than EINTR/EAGAIN.
+ */
+static void *
+pci_viona_poll_thread(void *param)
+{
+	struct pci_viona_softc *sc = param;
+	pollfd_t pollset;
+	const int fd = sc->vsc_vnafd;
+
+	pollset.fd = fd;
+	pollset.events = POLLRDBAND;
+
+	for (;;) {
+		if (poll(&pollset, 1, -1) < 0) {
+			if (errno == EINTR || errno == EAGAIN) {
+				continue;
+			} else {
+				WPRINTF(("pci_viona_poll_thread poll() "
+				    "error %d\n", errno));
+				break;
+			}
+		}
+		if (pollset.revents & POLLRDBAND) {
+			vioc_intr_poll_t vip;
+			uint_t i;
+			int res;
+			boolean_t assert_lintr = B_FALSE;
+			const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi);
+
+			res = ioctl(fd, VNA_IOC_INTR_POLL, &vip);
+			for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) {
+				int clr;
+
+				if (vip.vip_status[i] == 0) {
+					continue;
+				}
+				if (do_msix) {
+					pci_generate_msix(sc->vsc_pi,
+					    sc->vsc_msix_table_idx[i]);
+				} else {
+					assert_lintr = B_TRUE;
+				}
+				/*
+				 * Kept apart from "res": reusing the loop
+				 * condition variable here would end the scan
+				 * after the first ring that was serviced.
+				 */
+				clr = ioctl(fd, VNA_IOC_RING_INTR_CLR, i);
+				if (clr != 0) {
+					WPRINTF(("ioctl viona vq %d intr "
+					    "clear failed %d\n", i, errno));
+				}
+			}
+			if (assert_lintr) {
+				pthread_mutex_lock(&sc->vsc_mtx);
+				sc->vsc_isr |= VTCFG_ISR_QUEUES;
+				pci_lintr_assert(sc->vsc_pi);
+				pthread_mutex_unlock(&sc->vsc_mtx);
+			}
+		}
+	}
+
+	pthread_exit(NULL);
+}
+
+/*
+ * Initialise the currently selected ring (sc->vsc_curq) in the viona
+ * driver from the page frame number written by the guest.  The control
+ * queue is ignored.  A failing ioctl is only logged.
+ */
+static void
+pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
+{
+	const int qnum = sc->vsc_curq;
+	const uint64_t qaddr = (pfn << VRING_PFN);
+	vioc_ring_init_t vna_ri;
+
+	assert(qnum < VIONA_MAXQ);
+
+	if (qnum == VIONA_CTLQ) {
+		return;
+	}
+
+	sc->vsc_pfn[qnum] = qaddr;
+
+	vna_ri.ri_index = qnum;
+	vna_ri.ri_qsize = pci_viona_qsize(sc, qnum);
+	vna_ri.ri_qaddr = qaddr;
+
+	if (ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri) != 0) {
+		WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno));
+	}
+}
+
+/*
+ * Open the viona control device and create a viona instance bound to the
+ * softc's datalink id and the VM's device descriptor.
+ *
+ * Returns 0 on success with sc->vsc_vnafd open, or -1 on failure with
+ * sc->vsc_vnafd set to -1.
+ */
+static int
+pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
+{
+	vioc_create_t vna_create;
+	int error;
+
+	sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL);
+	if (sc->vsc_vnafd == -1) {
+		WPRINTF(("open viona ctl failed: %d\n", errno));
+		return (-1);
+	}
+
+	vna_create.c_linkid = sc->vsc_linkid;
+	vna_create.c_vmfd = vm_get_device_fd(ctx);
+	error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
+	if (error != 0) {
+		/* close() may change errno; keep the ioctl's value. */
+		const int saved_errno = errno;
+
+		(void) close(sc->vsc_vnafd);
+		sc->vsc_vnafd = -1;
+		WPRINTF(("ioctl viona create failed %d\n", saved_errno));
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Parse the comma-separated device option string into the softc.
+ *
+ * A bare word (no '=') names the vnic and may appear only once.  Supported
+ * <param>=<value> options:
+ *   feature_mask=<n>  non-negative 32-bit number; an invalid value is
+ *                     reported but is not fatal and leaves the mask as is
+ *   vqsize=<n>        ring size, a power of 2 with 2 < n <= 32768
+ *
+ * The string is modified in place (separators are overwritten with NULs).
+ * Returns 0 on success and -1 if any option was invalid or no vnic was
+ * named.
+ */
+static int
+pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts)
+{
+	char *next, *cp, *vnic = NULL;
+	int err = 0;
+
+	sc->vsc_vq_size = VIONA_RINGSZ;
+	sc->vsc_feature_mask = 0;
+
+	for (; opts != NULL && *opts != '\0'; opts = next) {
+		char *val, *end;
+		long num;
+
+		if ((cp = strchr(opts, ',')) != NULL) {
+			*cp = '\0';
+			next = cp + 1;
+		} else {
+			next = NULL;
+		}
+
+		if ((cp = strchr(opts, '=')) == NULL) {
+			/* vnic chosen with bare name */
+			if (vnic != NULL) {
+				fprintf(stderr,
+				    "viona: unexpected vnic name '%s'\n", opts);
+				err = -1;
+			} else {
+				vnic = opts;
+			}
+			continue;
+		}
+
+		/* <param>=<value> handling */
+		val = cp + 1;
+		*cp = '\0';
+		if (strcmp(opts, "feature_mask") == 0) {
+			errno = 0;
+			num = strtol(val, &end, 0);
+			if (errno != 0 || end == val || *end != '\0' ||
+			    num < 0 || (unsigned long)num > UINT32_MAX) {
+				fprintf(stderr,
+				    "viona: invalid mask '%s'\n", val);
+			} else {
+				sc->vsc_feature_mask = num;
+			}
+		} else if (strcmp(opts, "vqsize") == 0) {
+			errno = 0;
+			num = strtol(val, &end, 0);
+			if (errno != 0 || end == val || *end != '\0') {
+				fprintf(stderr,
+				    "viona: invalid vqsize '%s'\n", val);
+				err = -1;
+			} else if (num <= 2 || num > 32768) {
+				fprintf(stderr,
+				    "viona: vqsize %ld out of range\n", num);
+				err = -1;
+			} else if ((num & (num - 1)) != 0) {
+				fprintf(stderr,
+				    "viona: vqsize %ld must be power of 2\n",
+				    num);
+				err = -1;
+			} else {
+				sc->vsc_vq_size = num;
+			}
+		} else {
+			fprintf(stderr,
+			    "viona: unrecognized option '%s'\n", opts);
+			err = -1;
+		}
+	}
+	if (vnic == NULL) {
+		fprintf(stderr, "viona: vnic name required\n");
+		sc->vsc_linkname[0] = '\0';
+		err = -1;
+	} else {
+		(void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN);
+	}
+
+	DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc,
+	    sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask));
+	return (err);
+}
+
+/*
+ * Create a virtio-viona network device instance.
+ *
+ * Parses the options, resolves the named vnic to a datalink id and MAC
+ * address through libdladm, creates the in-kernel viona instance, sets up
+ * PCI config space, the MSI-X capability and the legacy I/O BAR, installs
+ * the queue-notify ioport hook and finally starts the interrupt poll
+ * thread.
+ *
+ * The poll thread is started only after every step that can fail, so the
+ * failure path can release the softc and the viona descriptor without a
+ * running thread still using them.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	dladm_handle_t handle;
+	dladm_status_t status;
+	dladm_vnic_attr_t attr;
+	char errmsg[DLADM_STRSIZE];
+	int error, i;
+	struct pci_viona_softc *sc;
+	uint64_t ioport;
+	pthread_t tid;
+
+	if (opts == NULL) {
+		printf("virtio-viona: vnic required\n");
+		return (1);
+	}
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL) {
+		WPRINTF(("virtio-viona: unable to allocate softc\n"));
+		return (1);
+	}
+	sc->vsc_vnafd = -1;
+
+	pi->pi_arg = sc;
+	sc->vsc_pi = pi;
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	if (pci_viona_parse_opts(sc, opts) != 0)
+		goto fail;
+
+	if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
+		WPRINTF(("could not open /dev/dld\n"));
+		goto fail;
+	}
+
+	/*
+	 * The link name is taken from the softc: "opts" was cut up in place
+	 * by the option parser.  "status" is refreshed on every call so the
+	 * message describes the call that actually failed.
+	 */
+	status = dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid,
+	    NULL, NULL, NULL);
+	if (status != DLADM_STATUS_OK) {
+		WPRINTF(("dladm_name2info() for %s failed: %s\n",
+		    sc->vsc_linkname, dladm_status2str(status, errmsg)));
+		dladm_close(handle);
+		goto fail;
+	}
+
+	status = dladm_vnic_info(handle, sc->vsc_linkid, &attr,
+	    DLADM_OPT_ACTIVE);
+	if (status != DLADM_STATUS_OK) {
+		WPRINTF(("dladm_vnic_info() for %s failed: %s\n",
+		    sc->vsc_linkname, dladm_status2str(status, errmsg)));
+		dladm_close(handle);
+		goto fail;
+	}
+
+	memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
+
+	dladm_close(handle);
+
+	error = pci_viona_viona_init(ctx, sc);
+	if (error != 0) {
+		/* The helper has already disposed of any descriptor. */
+		sc->vsc_vnafd = -1;
+		goto fail;
+	}
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	/* MSI-X support */
+	for (i = 0; i < VIONA_MAXQ; i++)
+		sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
+
+	/* BAR 1 used to map MSI-X table and PBA */
+	if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1))
+		goto fail;
+
+	/* BAR 0 for legacy-style virtio register access. */
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+	if (error != 0) {
+		WPRINTF(("could not allocate virtio BAR\n"));
+		goto fail;
+	}
+
+	/* Install ioport hook for virtqueue notification */
+	ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY;
+	error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport);
+	if (error != 0) {
+		WPRINTF(("could not install ioport hook at %llx\n",
+		    (unsigned long long)ioport));
+		goto fail;
+	}
+
+	/*
+	 * Need a legacy interrupt for virtio compliance, even though MSI-X
+	 * operation is _strongly_ suggested for adequate performance.
+	 */
+	pci_lintr_request(pi);
+
+	error = pthread_create(&tid, NULL, pci_viona_poll_thread, sc);
+	assert(error == 0);
+
+	return (0);
+
+fail:
+	if (sc->vsc_vnafd != -1)
+		(void) close(sc->vsc_vnafd);
+	pi->pi_arg = NULL;
+	free(sc);
+	return (1);
+}
+
+/*
+ * Translate a BAR 0 offset used by the guest into the offset the register
+ * handlers switch on.  While MSI-X is not enabled, any offset at or beyond
+ * VTCFG_R_MSIX is moved up by (VTCFG_R_CFG1 - VTCFG_R_MSIX); every other
+ * offset is returned unchanged.
+ */
+static uint64_t
+viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
+{
+	/* Nothing to adjust when MSI-X is on or the offset precedes it. */
+	if (pci_msix_enabled(pi) || offset < VTCFG_R_MSIX)
+		return (offset);
+
+	return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+}
+
+/*
+ * Hand the MSI address/data pair for one ring to the viona driver via
+ * VNA_IOC_RING_SET_MSI.
+ *
+ * The ring's MSI-X table index is taken from the softc.  An address and
+ * message of zero are sent unless MSI-X is active, the ring has a vector
+ * assigned, and that table entry is not masked.  A failing ioctl is only
+ * logged.
+ */
+static void
+pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	const struct msix_table_entry *entry;
+	vioc_ring_msi_t vrm;
+	uint16_t vec;
+
+	assert(ring <= VIONA_VQ_TX);
+
+	vec = sc->vsc_msix_table_idx[ring];
+
+	vrm.rm_index = ring;
+	vrm.rm_addr = 0;
+	vrm.rm_msg = 0;
+
+	if (sc->vsc_msix_active && vec != VIRTIO_MSI_NO_VECTOR) {
+		entry = &pi->pi_msix.table[vec];
+		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+			vrm.rm_addr = entry->addr;
+			vrm.rm_msg = entry->msg_data;
+		}
+	}
+
+	if (ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm) != 0) {
+		WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno));
+	}
+}
+
+/*
+ * pe_lintrupdate hook.  With vsc_mtx held, recompute whether MSI-X is in
+ * effect (capability enabled and function mask clear).  When that differs
+ * from the cached vsc_msix_active flag, store the new state and re-send the
+ * MSI settings for rings 0 through VIONA_VQ_TX.
+ */
+static void
+pci_viona_lintrupdate(struct pci_devinst *pi)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	boolean_t enabled;
+	uint_t ring;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	enabled = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0);
+
+	/* Act only on a transition between active and inactive. */
+	if (!sc->vsc_msix_active != !enabled) {
+		sc->vsc_msix_active = enabled;
+
+		/* Update in-kernel ring configs */
+		for (ring = 0; ring <= VIONA_VQ_TX; ring++)
+			pci_viona_ring_set_msix(pi, ring);
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+/*
+ * Called after a successful write to the MSI-X table at `offset`.  If MSI-X
+ * is currently active, every ring (0 through VIONA_VQ_TX) whose assigned
+ * table index matches the written entry has its MSI settings re-sent.
+ * Runs entirely under vsc_mtx.
+ */
+static void
+pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	uint_t entry, ring;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	if (sc->vsc_msix_active) {
+		/*
+		 * Instead of refreshing every vector, derive the table entry
+		 * from the write offset.  The caller only gets here when the
+		 * table write succeeded, so the index is expected to be valid.
+		 */
+		entry = offset / MSIX_TABLE_ENTRY_SIZE;
+
+		for (ring = 0; ring <= VIONA_VQ_TX; ring++) {
+			if (sc->vsc_msix_table_idx[ring] == entry)
+				pci_viona_ring_set_msix(pi, ring);
+		}
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+/*
+ * Handle a queue-notify for `ring`.  The TX and RX rings are kicked through
+ * the VNA_IOC_RING_KICK ioctl (a failure is only logged); a notify on the
+ * control queue is merely traced; any other ring number is ignored.
+ */
+static void
+pci_viona_qnotify(struct pci_viona_softc *sc, int ring)
+{
+	if (ring == VIONA_TXQ || ring == VIONA_RXQ) {
+		if (ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring) != 0) {
+			WPRINTF(("ioctl viona ring %d kick failed %d\n",
+			    ring, errno));
+		}
+	} else if (ring == VIONA_CTLQ) {
+		DPRINTF(("viona: control qnotify!\n"));
+	}
+}
+
+/*
+ * BAR write handler for the viona device (installed as pe_barwrite).
+ *
+ * Writes to the MSI-X table/PBA BAR are passed to pci_emul_msix_twrite();
+ * when that succeeds the MSI settings of the affected ring are refreshed.
+ * Every other write must target BAR 0.  Accesses reaching past
+ * pci_viona_iosize() are traced and dropped.  Register dispatch runs with
+ * vsc_mtx held, on the offset returned by viona_adjust_offset().
+ *
+ * ctx and vcpu are not used by this handler.
+ */
+static void
+pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	void *ptr;
+	uint16_t val16;
+	uint32_t val32;
+	int err = 0;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		if (pci_emul_msix_twrite(pi, offset, size, value) == 0) {
+			pci_viona_msix_update(pi, offset);
+		}
+		return;
+	}
+
+	assert(baridx == 0);
+
+	if (offset + size > pci_viona_iosize(pi)) {
+		DPRINTF(("viona_write: 2big, offset %ld size %d\n",
+		    offset, size));
+		return;
+	}
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	offset = viona_adjust_offset(pi, offset);
+
+	switch (offset) {
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		/* Strip features that are masked off for this device. */
+		value &= ~(sc->vsc_feature_mask);
+		err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value);
+		if (err != 0) {
+			WPRINTF(("ioctl feature negotiation returned"
+			    " err = %d\n", errno));
+		} else {
+			sc->vsc_features = value;
+		}
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		pci_viona_ring_init(sc, value);
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		assert(value < VIONA_MAXQ);
+		sc->vsc_curq = value;
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		assert(value < VIONA_MAXQ);
+		pci_viona_qnotify(sc, value);
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		pci_viona_update_status(sc, value);
+		break;
+	case VTCFG_R_CFGVEC:
+		assert(size == 2);
+		sc->vsc_msix_table_idx[VIONA_CTLQ] = value;
+		break;
+	case VTCFG_R_QVEC:
+		assert(size == 2);
+		assert(sc->vsc_curq != VIONA_CTLQ);
+		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
+		pci_viona_ring_set_msix(pi, sc->vsc_curq);
+		break;
+	case VIONA_R_CFG0:
+	case VIONA_R_CFG1:
+	case VIONA_R_CFG2:
+	case VIONA_R_CFG3:
+	case VIONA_R_CFG4:
+	case VIONA_R_CFG5:
+		assert((size + offset) <= (VIONA_R_CFG5 + 1));
+		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
+		/*
+		 * The driver is allowed to change the MAC address.
+		 *
+		 * The destination may sit at any byte offset inside the
+		 * MAC array, so multi-byte stores go through memcpy()
+		 * rather than a cast to a wider integer pointer.
+		 */
+		if (size == 1) {
+			*(uint8_t *)ptr = value;
+		} else if (size == 2) {
+			val16 = (uint16_t)value;
+			memcpy(ptr, &val16, sizeof (val16));
+		} else {
+			val32 = (uint32_t)value;
+			memcpy(ptr, &val32, sizeof (val32));
+		}
+		break;
+	case VTCFG_R_HOSTCAP:
+	case VTCFG_R_QNUM:
+	case VTCFG_R_ISR:
+	case VIONA_R_CFG6:
+	case VIONA_R_CFG7:
+		DPRINTF(("viona: write to readonly reg %ld\n\r", offset));
+		break;
+	default:
+		DPRINTF(("viona: unknown i/o write offset %ld\n\r", offset));
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+/*
+ * BAR read handler for the viona device (installed as pe_barread).
+ *
+ * Reads of the MSI-X table/PBA BAR are answered by pci_emul_msix_tread().
+ * Every other read must target BAR 0.  Accesses reaching past
+ * pci_viona_iosize() are traced and return 0.  Register dispatch runs with
+ * vsc_mtx held, on the offset returned by viona_adjust_offset().
+ *
+ * Returns the register value; 0 for unknown offsets, and 0 for the host
+ * feature register when the feature ioctl fails.
+ *
+ * ctx and vcpu are not used by this handler.
+ */
+static uint64_t
+pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	void *ptr;
+	uint16_t val16;
+	uint32_t val32;
+	/*
+	 * Start from zero so that a failed VNA_IOC_GET_FEATURES ioctl does
+	 * not leak an indeterminate stack value to the guest.
+	 */
+	uint64_t value = 0;
+	int err = 0;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		return (pci_emul_msix_tread(pi, offset, size));
+	}
+
+	assert(baridx == 0);
+
+	if (offset + size > pci_viona_iosize(pi)) {
+		DPRINTF(("viona_read: 2big, offset %ld size %d\n",
+		    offset, size));
+		return (0);
+	}
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	offset = viona_adjust_offset(pi, offset);
+
+	switch (offset) {
+	case VTCFG_R_HOSTCAP:
+		assert(size == 4);
+		err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value);
+		if (err != 0) {
+			WPRINTF(("ioctl get host features returned"
+			    " err = %d\n", errno));
+		}
+		/* Hide features that are masked off for this device. */
+		value &= ~sc->vsc_feature_mask;
+		break;
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		value = sc->vsc_features; /* XXX never read ? */
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+		break;
+	case VTCFG_R_QNUM:
+		assert(size == 2);
+		value = pci_viona_qsize(sc, sc->vsc_curq);
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		value = sc->vsc_curq;  /* XXX never read ? */
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		value = sc->vsc_curq;  /* XXX never read ? */
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		value = sc->vsc_status;
+		break;
+	case VTCFG_R_ISR:
+		assert(size == 1);
+		value = sc->vsc_isr;
+		sc->vsc_isr = 0;	/* a read clears this flag */
+		if (value != 0) {
+			pci_lintr_deassert(pi);
+		}
+		break;
+	case VTCFG_R_CFGVEC:
+		assert(size == 2);
+		value = sc->vsc_msix_table_idx[VIONA_CTLQ];
+		break;
+	case VTCFG_R_QVEC:
+		assert(size == 2);
+		assert(sc->vsc_curq != VIONA_CTLQ);
+		value = sc->vsc_msix_table_idx[sc->vsc_curq];
+		break;
+	case VIONA_R_CFG0:
+	case VIONA_R_CFG1:
+	case VIONA_R_CFG2:
+	case VIONA_R_CFG3:
+	case VIONA_R_CFG4:
+	case VIONA_R_CFG5:
+		assert((size + offset) <= (VIONA_R_CFG5 + 1));
+		ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
+		/*
+		 * The source may sit at any byte offset inside the MAC
+		 * array, so multi-byte loads go through memcpy() rather
+		 * than a cast to a wider integer pointer.
+		 */
+		if (size == 1) {
+			value = *(uint8_t *)ptr;
+		} else if (size == 2) {
+			memcpy(&val16, ptr, sizeof (val16));
+			value = val16;
+		} else {
+			memcpy(&val32, ptr, sizeof (val32));
+			value = val32;
+		}
+		break;
+	case VIONA_R_CFG6:
+		assert(size != 4);
+		value = 0x01;	/* XXX link always up */
+		break;
+	case VIONA_R_CFG7:
+		assert(size == 1);
+		value = 0;	/* XXX link status in LSB */
+		break;
+	default:
+		DPRINTF(("viona: unknown i/o read offset %ld\n\r", offset));
+		value = 0;
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+
+	return (value);
+}
+
+/*
+ * Device emulation descriptor for "virtio-net-viona": names the init, BAR
+ * read/write and lintrupdate callbacks defined in this file.  The
+ * descriptor is handed to PCI_EMUL_SET() immediately below.
+ */
+struct pci_devemu pci_de_viona = {
+ .pe_emu = "virtio-net-viona",
+ .pe_init = pci_viona_init,
+ .pe_barwrite = pci_viona_write,
+ .pe_barread = pci_viona_read,
+ .pe_lintrupdate = pci_viona_lintrupdate
+};
+PCI_EMUL_SET(pci_de_viona);
diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c
new file mode 100644
index 0000000000..988e6933cc
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_xhci.c
@@ -0,0 +1,2862 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com>
+ * Copyright 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ XHCI options:
+ -s <n>,xhci,{devices}
+
+ devices:
+ tablet USB tablet mouse
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include <dev/usb/usbdi.h>
+#include <dev/usb/usb.h>
+#include <dev/usb/usb_freebsd.h>
+#include <xhcireg.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "pci_xhci.h"
+#include "usb_emul.h"
+
+
+/*
+ * Debug tracing: DPRINTF output is produced only when xhci_debug is
+ * non-zero; WPRINTF always prints.  Both take a parenthesized printf
+ * argument list, e.g. DPRINTF(("x=%d\n", x)).
+ *
+ * NOTE(review): DPRINTF expands to a bare `if`, so using it as the sole
+ * body of an unbraced if/else would capture the following `else`.
+ */
+static int xhci_debug = 0;
+#define DPRINTF(params) if (xhci_debug) printf params
+#define WPRINTF(params) printf params
+
+
+#define XHCI_NAME "xhci"
+#define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */
+
+#define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */
+
+/*
+ * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping
+ * to 4k to avoid going over the guest physical memory barrier.
+ */
+#define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */
+
+#define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */
+
+#define XHCI_CAPLEN (4*8) /* offset of op register space */
+/* NOTE(review): name is spelled HCCPRAMS2 (no 'A'); kept as-is for users. */
+#define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */
+#define XHCI_PORTREGS_START 0x400
+#define XHCI_DOORBELL_MAX 256
+
+#define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */
+
+/*
+ * The XHCI_SET_* macros below mask a field value and shift it into its bit
+ * position within the named register; XHCI_GET_HCIVERSION extracts the
+ * version field again.
+ */
+
+/* caplength and hci-version registers */
+#define XHCI_SET_CAPLEN(x) ((x) & 0xFF)
+#define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16)
+#define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF)
+
+/* hcsparams1 register */
+#define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF)
+#define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8)
+#define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24)
+
+/* hcsparams2 register */
+#define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F)
+#define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4)
+#define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21)
+#define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27)
+
+/* hcsparams3 register */
+#define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF)
+#define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16)
+
+/* hccparams1 register */
+#define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01)
+#define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1)
+#define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2)
+#define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3)
+#define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4)
+#define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5)
+#define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6)
+#define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7)
+#define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8)
+#define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9)
+#define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10)
+#define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11)
+#define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12)
+#define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16)
+
+/* hccparams2 register */
+#define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01)
+#define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1)
+#define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2)
+#define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3)
+#define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4)
+#define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5)
+
+/* other registers */
+#define XHCI_SET_DOORBELL(x) ((x) & ~0x03)
+#define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F)
+
+/* register masks */
+#define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */
+#define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */
+#define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */
+
+/*
+ * port register set
+ *
+ * XHCI_PORTREGS_PORT0 is one register set (XHCI_PORTREGS_SETSZ) below
+ * XHCI_PORTREGS_BASE, so dividing (offset - XHCI_PORTREGS_PORT0) by the set
+ * size yields 1 for the first set at the base offset.
+ * NOTE(review): XHCI_PORTREGS_BASE duplicates XHCI_PORTREGS_START above.
+ */
+#define XHCI_PORTREGS_BASE 0x400 /* base offset */
+#define XHCI_PORTREGS_PORT0 0x3F0
+#define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */
+
+/* keep only the upper / lower 32 bits of a 64-bit value */
+#define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL)
+#define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL)
+
+/*
+ * FIELD_REPLACE: clear the field (mask m at shift s) in a and insert the
+ * unshifted value b there.  FIELD_COPY: clear the same field in a and OR in
+ * the bits of b that already lie inside that field.
+ */
+#define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \
+ (((b) & (m)) << (s)))
+#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \
+ (((b) & ((m) << (s)))))
+
+/*
+ * Consumer-side position of one TRB ring: where the next TRB will be taken
+ * from and the cycle state that goes with it.
+ */
+struct pci_xhci_trb_ring {
+ uint64_t ringaddr; /* current dequeue guest address */
+ uint32_t ccs; /* consumer cycle state */
+};
+
+/*
+ * device endpoint transfer/stream rings
+ *
+ * Each union holds one of two representations, and the ep_* macros give
+ * short names for the members: an endpoint without streams uses ep_tr plus
+ * the embedded ring state (ep_ringaddr / ep_ccs); an endpoint with streams
+ * uses ep_sctx plus the separately allocated ep_sctx_trbs array.  Because
+ * the members overlap, only the pair matching the endpoint's mode is
+ * meaningful.
+ */
+struct pci_xhci_dev_ep {
+ union {
+ struct xhci_trb *_epu_tr;
+ struct xhci_stream_ctx *_epu_sctx;
+ } _ep_trbsctx;
+#define ep_tr _ep_trbsctx._epu_tr
+#define ep_sctx _ep_trbsctx._epu_sctx
+
+ union {
+ struct pci_xhci_trb_ring _epu_trb;
+ struct pci_xhci_trb_ring *_epu_sctx_trbs;
+ } _ep_trb_rings;
+#define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr
+#define ep_ccs _ep_trb_rings._epu_trb.ccs
+#define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs
+
+ struct usb_data_xfer *ep_xfer; /* transfer chain */
+};
+
+/*
+ * device context base address array: maps slot->device context
+ *
+ * Indexed directly by slot number; an entry of 0 is treated as "no device
+ * context" by pci_xhci_get_dev_ctx().
+ */
+struct xhci_dcbaa {
+ uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */
+};
+
+/*
+ * port status registers
+ *
+ * One set per port; the four 32-bit members correspond to byte offsets
+ * 0, 4, 8 and 12 within a port register set, as dispatched in
+ * pci_xhci_portregs_write().
+ */
+struct pci_xhci_portregs {
+ uint32_t portsc; /* port status and control */
+ uint32_t portpmsc; /* port pwr mgmt status & control */
+ uint32_t portli; /* port link info */
+ uint32_t porthlpmc; /* port hardware LPM control */
+} __packed;
+/* place a 4-bit speed value into the PORTSC speed field (bits 10-13) */
+#define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10)
+
+/*
+ * xHC operational registers
+ *
+ * The first group mirrors guest-visible register contents; the trailing
+ * pointers are host-side mappings of the guest addresses held in crcr and
+ * dcbaap.
+ */
+struct pci_xhci_opregs {
+ uint32_t usbcmd; /* usb command */
+ uint32_t usbsts; /* usb status */
+ uint32_t pgsz; /* page size */
+ uint32_t dnctrl; /* device notification control */
+ uint64_t crcr; /* command ring control */
+ uint64_t dcbaap; /* device ctx base addr array ptr */
+ uint32_t config; /* configure */
+
+ /* guest mapped addresses: */
+ struct xhci_trb *cr_p; /* crcr dequeue */
+ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */
+};
+
+/*
+ * xHC runtime registers
+ *
+ * Holds a single interrupter register set plus the emulation's bookkeeping
+ * for the event ring (mapped pointers, enqueue/dequeue positions, number of
+ * queued events and the producer cycle state).
+ */
+struct pci_xhci_rtsregs {
+ uint32_t mfindex; /* microframe index */
+ struct { /* interrupter register set */
+ uint32_t iman; /* interrupter management */
+ uint32_t imod; /* interrupter moderation */
+ uint32_t erstsz; /* event ring segment table size */
+ uint32_t rsvd;
+ uint64_t erstba; /* event ring seg-tbl base addr */
+ uint64_t erdp; /* event ring dequeue ptr */
+ } intrreg __packed;
+
+ /* guest mapped addresses */
+ struct xhci_event_ring_seg *erstba_p;
+ struct xhci_trb *erst_p; /* event ring segment tbl */
+ int er_deq_seg; /* event ring dequeue segment */
+ int er_enq_idx; /* event ring enqueue index - xHCI */
+ int er_enq_seg; /* event ring enqueue segment */
+ uint32_t er_events_cnt; /* number of events in ER */
+ uint32_t event_pcs; /* producer cycle state flag */
+};
+
+
+struct pci_xhci_softc;
+
+
+/*
+ * USB device emulation container.
+ * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each
+ * emulated device instance.
+ *
+ * xsc points back at the owning controller; dev_slotstate tracks the slot
+ * state (XHCI_ST_* values) assigned by the slot commands.
+ */
+struct pci_xhci_dev_emu {
+ struct pci_xhci_softc *xsc;
+
+ /* XHCI contexts */
+ struct xhci_dev_ctx *dev_ctx;
+ struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS];
+ int dev_slotstate;
+
+ struct usb_devemu *dev_ue; /* USB emulated dev */
+ void *dev_sc; /* device's softc */
+
+ struct usb_hci hci;
+};
+
+/*
+ * Per-controller state for the emulated xHCI device: capability register
+ * values, operational and runtime register state, per-port registers and
+ * the port->device and slot->device lookup tables.
+ */
+struct pci_xhci_softc {
+ struct pci_devinst *xsc_pi;
+
+ pthread_mutex_t mtx;
+
+ uint32_t caplength; /* caplen & hciversion */
+ uint32_t hcsparams1; /* structural parameters 1 */
+ uint32_t hcsparams2; /* structural parameters 2 */
+ uint32_t hcsparams3; /* structural parameters 3 */
+ uint32_t hccparams1; /* capability parameters 1 */
+ uint32_t dboff; /* doorbell offset */
+ uint32_t rtsoff; /* runtime register space offset */
+ uint32_t hccparams2; /* capability parameters 2 */
+
+ uint32_t regsend; /* end of configuration registers */
+
+ struct pci_xhci_opregs opregs;
+ struct pci_xhci_rtsregs rtsregs;
+
+ struct pci_xhci_portregs *portregs;
+ struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */
+ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */
+ int ndevices;
+
+ int usb2_port_start;
+ int usb3_port_start;
+};
+
+
+/*
+ * portregs and devices arrays are set up to start from idx=1
+ *
+ * The three accessors below index the per-port register array, the
+ * per-port device table and the per-slot device table respectively; none of
+ * them performs a bounds check.
+ */
+#define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)]
+#define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)]
+#define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)]
+
+/* non-zero while the Host Controller Halted status bit is set */
+#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH)
+
+/*
+ * Map guest address `a` through paddr_guest2host(), requesting only the
+ * bytes from `a` up to the end of its XHCI_PADDR_SZ-aligned window.
+ * Note that `a` is evaluated twice, so it must be free of side effects.
+ */
+#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \
+ (a), \
+ XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1)))
+
+static int xhci_in_use;
+
+/*
+ * map USB errors to XHCI
+ *
+ * Table indexed by USB_ERR_* code giving the XHCI_TRB_ERROR_* completion
+ * code to report.  Any index below USB_ERR_MAX without an explicit
+ * initializer is zero-initialized.
+ */
+static const int xhci_usb_errors[USB_ERR_MAX] = {
+ [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS,
+ [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON,
+ [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID,
+ [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED,
+ [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER,
+ [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER,
+ [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER,
+ [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL,
+ [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED,
+ [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED,
+ [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON,
+ [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE,
+ [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB,
+ [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON,
+ [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED,
+ [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT,
+ [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL,
+ [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED,
+ [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF,
+ [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB,
+ [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED,
+ [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED,
+ [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED,
+};
+/*
+ * Look up the XHCI completion code for USB error e; codes at or above
+ * USB_ERR_MAX map to XHCI_TRB_ERROR_INVALID.
+ * NOTE(review): only the upper bound is checked, so a negative signed
+ * argument would index before the table -- confirm callers pass valid codes.
+ */
+#define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? xhci_usb_errors[(e)] : \
+ XHCI_TRB_ERROR_INVALID)
+
+static int pci_xhci_insert_event(struct pci_xhci_softc *sc,
+ struct xhci_trb *evtrb, int do_intr);
+static void pci_xhci_dump_trb(struct xhci_trb *trb);
+static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc);
+static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot);
+static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm);
+static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc,
+ struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep,
+ struct xhci_endp_ctx *ep_ctx, uint32_t streamid,
+ uint64_t ringaddr, int ccs);
+
+/*
+ * Fill in an event TRB.  The event type is encoded into dwTrb3, the
+ * completion code into dwTrb2, and the port number is placed at bit 24 and
+ * up of qwTrb0.  All three words are overwritten.
+ */
+static void
+pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode,
+    uint32_t evtype)
+{
+	evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype);
+	evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode);
+	evtrb->qwTrb0 = port << 24;
+}
+
+
+/*
+ * Controller reset: rewind the event ring bookkeeping (no queued events,
+ * enqueue index 0, producer cycle state 1) and reset every slot from 1
+ * through XHCI_MAX_SLOTS.
+ */
+static void
+pci_xhci_reset(struct pci_xhci_softc *sc)
+{
+	struct pci_xhci_rtsregs *rts = &sc->rtsregs;
+	int slot;
+
+	rts->event_pcs = 1;
+	rts->er_events_cnt = 0;
+	rts->er_enq_idx = 0;
+
+	slot = 1;
+	while (slot <= XHCI_MAX_SLOTS) {
+		pci_xhci_reset_slot(sc, slot);
+		slot++;
+	}
+}
+
+/*
+ * Apply a guest write to the USBCMD register and return the value to be
+ * stored in it.
+ *
+ * Run/Stop: clearing it halts the controller (HCH set, PCD cleared).
+ * Setting it clears HCH and sets PCD; on a stop->run transition a port
+ * status change event is queued for every attached device and an interrupt
+ * is asserted at the end.  HCRST triggers a controller reset and is cleared
+ * from the returned value, as are the CSS and CRS bits.
+ */
+static uint32_t
+pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd)
+{
+	int started = 0;
+	int portn;
+
+	if ((cmd & XHCI_CMD_RS) == 0) {
+		sc->opregs.usbcmd &= ~XHCI_CMD_RS;
+		sc->opregs.usbsts |= XHCI_STS_HCH;
+		sc->opregs.usbsts &= ~XHCI_STS_PCD;
+	} else {
+		/* Remember whether this write takes us from stop to run. */
+		started = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0;
+
+		sc->opregs.usbcmd |= XHCI_CMD_RS;
+		sc->opregs.usbsts &= ~XHCI_STS_HCH;
+		sc->opregs.usbsts |= XHCI_STS_PCD;
+
+		/* Queue port change event on controller run from stop */
+		for (portn = 1; started && portn <= XHCI_MAX_DEVS; portn++) {
+			struct pci_xhci_dev_emu *dev;
+			struct pci_xhci_portregs *regs;
+			struct xhci_trb evtrb;
+
+			dev = XHCI_DEVINST_PTR(sc, portn);
+			if (dev == NULL)
+				continue;
+
+			regs = XHCI_PORTREG_PTR(sc, portn);
+			regs->portsc |= XHCI_PS_CSC | XHCI_PS_CCS;
+			regs->portsc &= ~XHCI_PS_PLS_MASK;
+
+			/*
+			 * XHCI 4.19.3 USB2 RxDetect->Polling,
+			 *             USB3 Polling->U0
+			 */
+			if (dev->dev_ue->ue_usbver == 2) {
+				regs->portsc |=
+				    XHCI_PS_PLS_SET(UPS_PORT_LS_POLL);
+			} else {
+				regs->portsc |=
+				    XHCI_PS_PLS_SET(UPS_PORT_LS_U0);
+			}
+
+			pci_xhci_set_evtrb(&evtrb, portn,
+			    XHCI_TRB_ERROR_SUCCESS,
+			    XHCI_TRB_EVENT_PORT_STS_CHANGE);
+
+			/* Stop queueing as soon as an insert fails. */
+			if (pci_xhci_insert_event(sc, &evtrb, 0) !=
+			    XHCI_TRB_ERROR_SUCCESS)
+				break;
+		}
+	}
+
+	/* start execution of schedule; stop when set to 0 */
+	cmd |= sc->opregs.usbcmd & XHCI_CMD_RS;
+
+	if (cmd & XHCI_CMD_HCRST) {
+		/* reset controller */
+		pci_xhci_reset(sc);
+		cmd &= ~XHCI_CMD_HCRST;
+	}
+
+	cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS);
+
+	if (started)
+		pci_xhci_assert_interrupt(sc);
+
+	return (cmd);
+}
+
+/*
+ * Handle a guest write into the port register area.
+ *
+ * `offset` is the register offset as seen by the caller; it is split into a
+ * port number (1 for the first set, because XHCI_PORTREGS_PORT0 lies one set
+ * below the base) and a byte offset within that port's register set.
+ * Writes are ignored when no port registers exist or the port number
+ * exceeds XHCI_MAX_DEVS.  Within a set: offset 0 is PORTSC, 4 is PORTPMSC,
+ * 8 is PORTLI (writes only traced) and 12 is PORTHLPMC.
+ */
+static void
+pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset,
+ uint64_t value)
+{
+ struct xhci_trb evtrb;
+ struct pci_xhci_portregs *p;
+ int port;
+ uint32_t oldpls, newpls;
+
+ if (sc->portregs == NULL)
+ return;
+
+ port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ;
+ offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ;
+
+ DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx\r\n",
+ offset, port, value));
+
+ assert(port >= 0);
+
+ if (port > XHCI_MAX_DEVS) {
+ DPRINTF(("pci_xhci: portregs_write port %d > ndevices\r\n",
+ port));
+ return;
+ }
+
+ /* An unattached port is only traced; the write is still processed. */
+ if (XHCI_DEVINST_PTR(sc, port) == NULL) {
+ DPRINTF(("pci_xhci: portregs_write to unattached port %d\r\n",
+ port));
+ }
+
+ p = XHCI_PORTREG_PTR(sc, port);
+ switch (offset) {
+ case 0:
+ /* port reset or warm reset */
+ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) {
+ pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR);
+ break;
+ }
+
+ if ((p->portsc & XHCI_PS_PP) == 0) {
+ WPRINTF(("pci_xhci: portregs_write to unpowered "
+ "port %d\r\n", port));
+ break;
+ }
+
+ /* Port status and control register */
+ oldpls = XHCI_PS_PLS_GET(p->portsc);
+ newpls = XHCI_PS_PLS_GET(value);
+
+ /* keep only the enable, link state, speed and indicator fields */
+ p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK |
+ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK;
+
+ if (XHCI_DEVINST_PTR(sc, port))
+ p->portsc |= XHCI_PS_CCS;
+
+ /* take every written bit except the ones excluded below */
+ p->portsc |= (value &
+ ~(XHCI_PS_OCA |
+ XHCI_PS_PR |
+ XHCI_PS_PED |
+ XHCI_PS_PLS_MASK | /* link state */
+ XHCI_PS_SPEED_MASK |
+ XHCI_PS_PIC_MASK | /* port indicator */
+ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR));
+
+ /* clear control bits */
+ p->portsc &= ~(value &
+ (XHCI_PS_CSC |
+ XHCI_PS_PEC |
+ XHCI_PS_WRC |
+ XHCI_PS_OCC |
+ XHCI_PS_PRC |
+ XHCI_PS_PLC |
+ XHCI_PS_CEC |
+ XHCI_PS_CAS));
+
+ /* port disable request; for USB3, don't care */
+ if (value & XHCI_PS_PED)
+ DPRINTF(("Disable port %d request\r\n", port));
+
+ /* link state changes are honored only with LWS set */
+ if (!(value & XHCI_PS_LWS))
+ break;
+
+ DPRINTF(("Port new PLS: %d\r\n", newpls));
+ switch (newpls) {
+ case 0: /* U0 */
+ case 3: /* U3 */
+ if (oldpls != newpls) {
+ p->portsc &= ~XHCI_PS_PLS_MASK;
+ p->portsc |= XHCI_PS_PLS_SET(newpls) |
+ XHCI_PS_PLC;
+
+ /* an event is raised only on a transition into U0 */
+ if (oldpls != 0 && newpls == 0) {
+ pci_xhci_set_evtrb(&evtrb, port,
+ XHCI_TRB_ERROR_SUCCESS,
+ XHCI_TRB_EVENT_PORT_STS_CHANGE);
+
+ pci_xhci_insert_event(sc, &evtrb, 1);
+ }
+ }
+ break;
+
+ default:
+ DPRINTF(("Unhandled change port %d PLS %u\r\n",
+ port, newpls));
+ break;
+ }
+ break;
+ case 4:
+ /* Port power management status and control register */
+ p->portpmsc = value;
+ break;
+ case 8:
+ /* Port link information register */
+ DPRINTF(("pci_xhci attempted write to PORTLI, port %d\r\n",
+ port));
+ break;
+ case 12:
+ /*
+ * Port hardware LPM control register.
+ * For USB3, this register is reserved.
+ */
+ p->porthlpmc = value;
+ break;
+ }
+}
+
+/*
+ * Return the host mapping of the device context that the DCBAA entry for
+ * `slot` points at, or NULL when that entry is zero.  The slot number and
+ * the presence of a mapped DCBAA are asserted.  The low six bits of the
+ * entry are masked off before the address is mapped.
+ */
+struct xhci_dev_ctx *
+pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot)
+{
+	struct xhci_dev_ctx *devctx;
+	uint64_t gpa;
+
+	assert(slot > 0 && slot <= sc->ndevices);
+	assert(sc->opregs.dcbaa_p != NULL);
+
+	gpa = sc->opregs.dcbaa_p->dcba[slot];
+	if (gpa != 0) {
+		DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx\r\n",
+		    slot, gpa));
+		devctx = XHCI_GADDR(sc, gpa & ~0x3FUL);
+		return (devctx);
+	}
+
+	DPRINTF(("get_dev_ctx devctx_addr == 0\r\n"));
+	return (NULL);
+}
+
+/*
+ * Advance from `curtrb` to the TRB that follows it.
+ *
+ * A link TRB is followed to the (16-byte aligned) address stored in its
+ * qwTrb0, which is mapped into host memory; any other TRB is followed by the
+ * next element in memory.  When `guestaddr` is non-NULL it is updated to the
+ * guest address of the returned TRB.
+ */
+struct xhci_trb *
+pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb,
+    uint64_t *guestaddr)
+{
+	struct xhci_trb *next;
+
+	assert(curtrb != NULL);
+
+	if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) != XHCI_TRB_TYPE_LINK) {
+		/* Ordinary TRB: step to the adjacent entry. */
+		if (guestaddr != NULL)
+			*guestaddr += sizeof(struct xhci_trb) & ~0xFUL;
+
+		return (curtrb + 1);
+	}
+
+	/* Link TRB: jump to the segment it points at. */
+	if (guestaddr != NULL)
+		*guestaddr = curtrb->qwTrb0 & ~0xFUL;
+
+	next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL);
+
+	return (next);
+}
+
+/*
+ * Flag an event interrupt as pending (EINT in USBSTS, interrupt-pending in
+ * IMAN, busy bit in ERDP) and, if both USBCMD.INTE and the interrupter
+ * enable bit are set, deliver it: as MSI vector 0 when MSI is enabled,
+ * otherwise by asserting the legacy interrupt line.
+ */
+static void
+pci_xhci_assert_interrupt(struct pci_xhci_softc *sc)
+{
+	struct pci_devinst *pi = sc->xsc_pi;
+
+	sc->opregs.usbsts |= XHCI_STS_EINT;
+	sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND;
+	sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY;
+
+	/* only trigger interrupt if permitted */
+	if ((sc->opregs.usbcmd & XHCI_CMD_INTE) == 0 ||
+	    (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA) == 0)
+		return;
+
+	if (pci_msi_enabled(pi)) {
+		pci_generate_msi(pi, 0);
+	} else {
+		pci_lintr_assert(pi);
+	}
+}
+
+/*
+ * Withdraw a previously raised interrupt.  Only the legacy (INTx) line has
+ * a level to drop, so nothing is done when MSI is enabled.
+ */
+static void
+pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc)
+{
+
+	/* Lower the line; the original raised it again by mistake. */
+	if (!pci_msi_enabled(sc->xsc_pi))
+		pci_lintr_deassert(sc->xsc_pi);
+}
+
+/*
+ * Set up the emulation's view of endpoint `epid` from the endpoint context
+ * in the device context.
+ *
+ * If the context's MaxPStreams field is non-zero, the stream context array
+ * is mapped and a per-stream ring-state array of that many entries is
+ * allocated and filled from the stream contexts.  Otherwise the single
+ * transfer ring's dequeue address and cycle state are recorded and the ring
+ * is mapped.  A transfer descriptor (ep_xfer) is allocated on first use.
+ *
+ * NOTE(review): neither the calloc() nor the malloc() result is checked.
+ */
+static void
+pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid)
+{
+	struct xhci_endp_ctx *ep_ctx = &dev->dev_ctx->ctx_ep[epid];
+	struct pci_xhci_dev_ep *devep = &dev->eps[epid];
+	uint32_t pstreams;
+	int sid;
+
+	pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0);
+	if (pstreams == 0) {
+		DPRINTF(("init_ep %d with no pstreams\r\n", epid));
+		devep->ep_ringaddr = ep_ctx->qwEpCtx2 &
+		    XHCI_EPCTX_2_TR_DQ_PTR_MASK;
+		devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2);
+		devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr);
+		DPRINTF(("init_ep tr DCS %x\r\n", devep->ep_ccs));
+	} else {
+		DPRINTF(("init_ep %d with pstreams %d\r\n", epid, pstreams));
+		assert(devep->ep_sctx_trbs == NULL);
+
+		devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 &
+		    XHCI_EPCTX_2_TR_DQ_PTR_MASK);
+		devep->ep_sctx_trbs = calloc(pstreams,
+		    sizeof(struct pci_xhci_trb_ring));
+
+		/* Seed each stream's ring state from its stream context. */
+		for (sid = 0; sid < pstreams; sid++) {
+			devep->ep_sctx_trbs[sid].ringaddr =
+			    devep->ep_sctx[sid].qwSctx0 &
+			    XHCI_SCTX_0_TR_DQ_PTR_MASK;
+			devep->ep_sctx_trbs[sid].ccs =
+			    XHCI_SCTX_0_DCS_GET(devep->ep_sctx[sid].qwSctx0);
+		}
+	}
+
+	if (devep->ep_xfer != NULL)
+		return;
+
+	devep->ep_xfer = malloc(sizeof(struct usb_data_xfer));
+	USB_DATA_XFER_INIT(devep->ep_xfer);
+}
+
+/*
+ * Tear down endpoint `epid`: mark its endpoint context disabled, release
+ * the per-stream ring-state array (when the context has streams) and the
+ * transfer descriptor, then zero the emulation's endpoint record.
+ */
+static void
+pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid)
+{
+	struct xhci_endp_ctx *ep_ctx;
+	struct pci_xhci_dev_ep *devep;
+
+	DPRINTF(("pci_xhci disable_ep %d\r\n", epid));
+
+	ep_ctx = &dev->dev_ctx->ctx_ep[epid];
+	ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED;
+
+	devep = &dev->eps[epid];
+
+	/*
+	 * ep_sctx_trbs shares storage with the single-ring state, so it is
+	 * only a heap pointer when the endpoint uses streams.  free(NULL)
+	 * is a no-op, so no separate NULL test is needed.
+	 */
+	if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0)
+		free(devep->ep_sctx_trbs);
+
+	free(devep->ep_xfer);
+
+	/* Wipes every pointer, including the ones just released. */
+	memset(devep, 0, sizeof(*devep));
+}
+
+
+/*
+ * Reset the device bound to `slot`: its slot state becomes
+ * XHCI_ST_DISABLED.  A slot with no device attached is only traced.
+ */
+static void
+pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot)
+{
+	struct pci_xhci_dev_emu *dev = XHCI_SLOTDEV_PTR(sc, slot);
+
+	if (dev != NULL) {
+		dev->dev_slotstate = XHCI_ST_DISABLED;
+	} else {
+		DPRINTF(("xhci reset unassigned slot (%d)?\r\n", slot));
+	}
+
+	/* TODO: reset ring buffer pointers */
+}
+
+/*
+ * Append an event TRB to the event ring.
+ *
+ * The event is copied to the ring at er_enq_idx with its cycle bit set to
+ * the current producer cycle state; the enqueue index then advances modulo
+ * the first segment's table size and the producer cycle state toggles on
+ * wrap-around.  Note that evtrb's dwTrb3 is modified in place.
+ *
+ * Returns XHCI_TRB_ERROR_SUCCESS, or XHCI_TRB_ERROR_EV_RING_FULL when the
+ * ring is full (in which case the event is not stored).  An interrupt is
+ * asserted when do_intr is non-zero, and also when the host-controller
+ * "ring full" error event is written into the last free entry.
+ */
+static int
+pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb,
+ int do_intr)
+{
+ struct pci_xhci_rtsregs *rts;
+ uint64_t erdp;
+ int erdp_idx;
+ int err;
+ struct xhci_trb *evtrbptr;
+
+ err = XHCI_TRB_ERROR_SUCCESS;
+
+ rts = &sc->rtsregs;
+
+ /* erdp and erdp_idx are used only for the debug trace below */
+ erdp = rts->intrreg.erdp & ~0xF;
+ erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) /
+ sizeof(struct xhci_trb);
+
+ DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]\r\n"
+ "\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u\r\n"
+ "\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)\r\n",
+ evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3,
+ erdp_idx, rts->er_deq_seg, rts->er_enq_idx,
+ rts->er_enq_seg,
+ rts->event_pcs, erdp, rts->erstba_p->qwEvrsTablePtr,
+ rts->erstba_p->dwEvrsTableSize, do_intr));
+
+ evtrbptr = &rts->erst_p[rts->er_enq_idx];
+
+ /* TODO: multi-segment table */
+ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) {
+ DPRINTF(("pci_xhci[%d] cannot insert event; ring full\r\n",
+ __LINE__));
+ err = XHCI_TRB_ERROR_EV_RING_FULL;
+ goto done;
+ }
+
+ /*
+ * One free entry left.  If the entry at the enqueue index already
+ * carries the producer's cycle bit, spend the last entry on a
+ * host-controller "event ring full" event instead of the caller's.
+ *
+ * NOTE(review): when the cycle bits differ, control falls through and
+ * the caller's event is stored without er_events_cnt being incremented
+ * -- confirm this is intended.
+ */
+ if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) {
+ struct xhci_trb errev;
+
+ if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) {
+
+ DPRINTF(("pci_xhci[%d] insert evt err: ring full\r\n",
+ __LINE__));
+
+ errev.qwTrb0 = 0;
+ errev.dwTrb2 = XHCI_TRB_2_ERROR_SET(
+ XHCI_TRB_ERROR_EV_RING_FULL);
+ errev.dwTrb3 = XHCI_TRB_3_TYPE_SET(
+ XHCI_TRB_EVENT_HOST_CTRL) |
+ rts->event_pcs;
+ rts->er_events_cnt++;
+ memcpy(&rts->erst_p[rts->er_enq_idx], &errev,
+ sizeof(struct xhci_trb));
+ rts->er_enq_idx = (rts->er_enq_idx + 1) %
+ rts->erstba_p->dwEvrsTableSize;
+ err = XHCI_TRB_ERROR_EV_RING_FULL;
+ do_intr = 1;
+
+ goto done;
+ }
+ } else {
+ rts->er_events_cnt++;
+ }
+
+ /* stamp the event with the producer cycle state */
+ evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT;
+ evtrb->dwTrb3 |= rts->event_pcs;
+
+ memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb));
+ rts->er_enq_idx = (rts->er_enq_idx + 1) %
+ rts->erstba_p->dwEvrsTableSize;
+
+ /* wrapped around: flip the producer cycle state */
+ if (rts->er_enq_idx == 0)
+ rts->event_pcs ^= 1;
+
+done:
+ if (do_intr)
+ pci_xhci_assert_interrupt(sc);
+
+ return (err);
+}
+
+/*
+ * Enable Slot command: pick the lowest-numbered slot that has a device
+ * bound to it and is currently disabled, mark it enabled, record the slot
+ * number as the device's HCI address and return it through *slot.
+ *
+ * Returns XHCI_TRB_ERROR_SUCCESS, or XHCI_TRB_ERROR_NO_SLOTS when no such
+ * slot exists or the controller has no port registers; *slot is left
+ * untouched in the failure case.
+ */
+static uint32_t
+pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot)
+{
+	struct pci_xhci_dev_emu *dev;
+	uint32_t cmderr = XHCI_TRB_ERROR_NO_SLOTS;
+	int n;
+
+	if (sc->portregs != NULL) {
+		for (n = 1; n <= XHCI_MAX_SLOTS; n++) {
+			dev = XHCI_SLOTDEV_PTR(sc, n);
+			if (dev == NULL ||
+			    dev->dev_slotstate != XHCI_ST_DISABLED)
+				continue;
+
+			dev->dev_slotstate = XHCI_ST_ENABLED;
+			dev->hci.hci_address = n;
+			*slot = n;
+			cmderr = XHCI_TRB_ERROR_SUCCESS;
+			break;
+		}
+	}
+
+	DPRINTF(("pci_xhci enable slot (error=%d) slot %u\r\n",
+	    cmderr != XHCI_TRB_ERROR_SUCCESS, *slot));
+
+	return (cmderr);
+}
+
+/*
+ * Disable Slot command.
+ *
+ * Returns XHCI_TRB_ERROR_NO_SLOTS when the controller has no port registers
+ * or no device is bound to the slot, XHCI_TRB_ERROR_SLOT_NOT_ON when the
+ * slot number exceeds ndevices or the slot is already disabled, and
+ * XHCI_TRB_ERROR_SUCCESS after marking the slot disabled.
+ */
+static uint32_t
+pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot)
+{
+	struct pci_xhci_dev_emu *dev;
+
+	DPRINTF(("pci_xhci disable slot %u\r\n", slot));
+
+	if (sc->portregs == NULL)
+		return (XHCI_TRB_ERROR_NO_SLOTS);
+
+	if (slot > sc->ndevices)
+		return (XHCI_TRB_ERROR_SLOT_NOT_ON);
+
+	dev = XHCI_SLOTDEV_PTR(sc, slot);
+	if (dev == NULL)
+		return (XHCI_TRB_ERROR_NO_SLOTS);
+
+	if (dev->dev_slotstate == XHCI_ST_DISABLED)
+		return (XHCI_TRB_ERROR_SLOT_NOT_ON);
+
+	dev->dev_slotstate = XHCI_ST_DISABLED;
+	/* TODO: reset events and endpoints */
+
+	return (XHCI_TRB_ERROR_SUCCESS);
+}
+
+/*
+ * Reset Device command.
+ *
+ * For an enabled slot: the device goes back to the DEFAULT state with
+ * hci_address 0, the slot context state field is set to
+ * XHCI_ST_SLCTX_DEFAULT, the 5-bit field at bit 27 of dwSctx0 ("number of
+ * contexts") is set to 1, and the state field of ctx_ep[2..31] is set to
+ * DISABLED.  pci_xhci_reset_slot() is then called regardless of whether the
+ * slot was usable.
+ *
+ * Returns XHCI_TRB_ERROR_NO_SLOTS without touching anything when the
+ * controller has no port registers, XHCI_TRB_ERROR_SLOT_NOT_ON for a missing
+ * or disabled slot, XHCI_TRB_ERROR_SUCCESS otherwise.
+ */
+static uint32_t
+pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot)
+{
+	struct pci_xhci_dev_emu *target;
+	struct xhci_dev_ctx *octx;
+	struct xhci_endp_ctx *epc;
+	uint32_t status;
+	int epno;
+
+	if (sc->portregs == NULL)
+		return (XHCI_TRB_ERROR_NO_SLOTS);
+
+	DPRINTF(("pci_xhci reset device slot %u\r\n", slot));
+
+	target = XHCI_SLOTDEV_PTR(sc, slot);
+	if (target != NULL && target->dev_slotstate != XHCI_ST_DISABLED) {
+		target->dev_slotstate = XHCI_ST_DEFAULT;
+		target->hci.hci_address = 0;
+
+		octx = pci_xhci_get_dev_ctx(sc, slot);
+
+		/* slot state */
+		octx->ctx_slot.dwSctx3 = FIELD_REPLACE(
+		    octx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT,
+		    0x1F, 27);
+
+		/* number of contexts */
+		octx->ctx_slot.dwSctx0 = FIELD_REPLACE(
+		    octx->ctx_slot.dwSctx0, 1, 0x1F, 27);
+
+		/* every endpoint context except ep-0 becomes disabled */
+		for (epno = 2; epno < 32; epno++) {
+			epc = &octx->ctx_ep[epno];
+			epc->dwEpCtx0 = FIELD_REPLACE(epc->dwEpCtx0,
+			    XHCI_ST_EPCTX_DISABLED, 0x7, 0);
+		}
+
+		status = XHCI_TRB_ERROR_SUCCESS;
+	} else {
+		status = XHCI_TRB_ERROR_SLOT_NOT_ON;
+	}
+
+	pci_xhci_reset_slot(sc, slot);
+
+	return (status);
+}
+/*
+ * Handle an Address Device command for `slot`.
+ *
+ * trb->qwTrb0, with its low four bits masked off, is the guest address of
+ * the input context.  The command fails with XHCI_TRB_ERROR_TRB unless the
+ * drop flags (dwInCtx0) are all clear and add flags 0 and 1 (dwInCtx1) are
+ * both set.  Otherwise the input slot context and the input ctx_ep[1]
+ * context are copied into the slot's device context; dwSctx3 is rewritten
+ * to slot state ADDRESSED with a device address equal to the slot number;
+ * ctx_ep[1] is marked RUNNING and set up with pci_xhci_init_ep(); and the
+ * emulated device's slot state becomes XHCI_ST_ADDRESSED.
+ *
+ * Returns an XHCI_TRB_ERROR_* completion code.  XHCI_TRB_ERROR_ENDP_NOT_ON
+ * is returned when the device model has no ue_reset hook or the hook
+ * returns a negative value.
+ *
+ * NOTE(review): `slot` is not range-checked here, and a slot with no device
+ * trips the assert() below -- confirm that callers only pass valid slots.
+ */
+static uint32_t
+pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot,
+ struct xhci_trb *trb)
+{
+ struct pci_xhci_dev_emu *dev;
+ struct xhci_input_dev_ctx *input_ctx;
+ struct xhci_slot_ctx *islot_ctx;
+ struct xhci_dev_ctx *dev_ctx;
+ struct xhci_endp_ctx *ep0_ctx;
+ uint32_t cmderr;
+
+ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL);
+ islot_ctx = &input_ctx->ctx_slot;
+ ep0_ctx = &input_ctx->ctx_ep[1];
+
+ cmderr = XHCI_TRB_ERROR_SUCCESS;
+
+ DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,\r\n"
+ " slot %08x %08x %08x %08x\r\n"
+ " ep0 %08x %08x %016lx %08x\r\n",
+ input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1,
+ islot_ctx->dwSctx0, islot_ctx->dwSctx1,
+ islot_ctx->dwSctx2, islot_ctx->dwSctx3,
+ ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2,
+ ep0_ctx->dwEpCtx4));
+
+ /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */
+ if ((input_ctx->ctx_input.dwInCtx0 != 0) ||
+ (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) {
+ DPRINTF(("pci_xhci: address device, input ctl invalid\r\n"));
+ cmderr = XHCI_TRB_ERROR_TRB;
+ goto done;
+ }
+
+ /* assign address to slot */
+ dev_ctx = pci_xhci_get_dev_ctx(sc, slot);
+
+ DPRINTF(("pci_xhci: address device, dev ctx\r\n"
+ " slot %08x %08x %08x %08x\r\n",
+ dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1,
+ dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3));
+
+ dev = XHCI_SLOTDEV_PTR(sc, slot);
+ assert(dev != NULL);
+ /* These two stores run before the reset hook and persist if it fails. */
+ dev->hci.hci_address = slot;
+ dev->dev_ctx = dev_ctx;
+
+ if (dev->dev_ue->ue_reset == NULL ||
+ dev->dev_ue->ue_reset(dev->dev_sc) < 0) {
+ cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON;
+ goto done;
+ }
+
+ memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx));
+ /* dwSctx3 is replaced wholesale: only slot state and device address. */
+ dev_ctx->ctx_slot.dwSctx3 =
+ XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) |
+ XHCI_SCTX_3_DEV_ADDR_SET(slot);
+
+ memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx));
+ ep0_ctx = &dev_ctx->ctx_ep[1];
+ ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) |
+ XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING);
+
+ pci_xhci_init_ep(dev, 1);
+
+ dev->dev_slotstate = XHCI_ST_ADDRESSED;
+
+ DPRINTF(("pci_xhci: address device, output ctx\r\n"
+ " slot %08x %08x %08x %08x\r\n"
+ " ep0 %08x %08x %016lx %08x\r\n",
+ dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1,
+ dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3,
+ ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2,
+ ep0_ctx->dwEpCtx4));
+
+done:
+ return (cmderr);
+}
+/*
+ * Handle a Configure Endpoint command for `slot`.
+ *
+ * With the DCEP bit set in dwTrb3 the slot is deconfigured: the device
+ * model's ue_stop hook runs if present, the slot returns to the ADDRESSED
+ * state with hci_address 0, the 5-bit field at bit 27 of dwSctx0 is set to
+ * 1, and endpoints 2..31 are disabled.
+ *
+ * Otherwise the slot must already be at least ADDRESSED (else
+ * XHCI_TRB_ERROR_SLOT_NOT_ON).  For each endpoint index 2..31 the drop flag
+ * in dwInCtx0 disables the endpoint, and the add flag in dwInCtx1 copies
+ * the input endpoint context over the device's one, initialises it with
+ * pci_xhci_init_ep() and marks it RUNNING.  The slot is then marked
+ * CONFIGURED.
+ *
+ * Returns an XHCI_TRB_ERROR_* completion code.
+ *
+ * NOTE(review): a slot with no device trips the assert() below, and the
+ * non-DCEP path uses dev->dev_ctx without a NULL check -- confirm the slot
+ * has always been through Address Device first.
+ */
+static uint32_t
+pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot,
+ struct xhci_trb *trb)
+{
+ struct xhci_input_dev_ctx *input_ctx;
+ struct pci_xhci_dev_emu *dev;
+ struct xhci_dev_ctx *dev_ctx;
+ struct xhci_endp_ctx *ep_ctx, *iep_ctx;
+ uint32_t cmderr;
+ int i;
+
+ cmderr = XHCI_TRB_ERROR_SUCCESS;
+
+ DPRINTF(("pci_xhci config_ep slot %u\r\n", slot));
+
+ dev = XHCI_SLOTDEV_PTR(sc, slot);
+ assert(dev != NULL);
+
+ if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) {
+ DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u\r\n",
+ slot));
+ if (dev->dev_ue->ue_stop != NULL)
+ dev->dev_ue->ue_stop(dev->dev_sc);
+
+ dev->dev_slotstate = XHCI_ST_ADDRESSED;
+
+ dev->hci.hci_address = 0;
+ dev_ctx = pci_xhci_get_dev_ctx(sc, slot);
+
+ /* number of contexts */
+ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE(
+ dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27);
+
+ /* slot state */
+ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE(
+ dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED,
+ 0x1F, 27);
+
+ /* disable endpoints */
+ for (i = 2; i < 32; i++)
+ pci_xhci_disable_ep(dev, i);
+
+ cmderr = XHCI_TRB_ERROR_SUCCESS;
+
+ goto done;
+ }
+
+ if (dev->dev_slotstate < XHCI_ST_ADDRESSED) {
+ DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed\r\n",
+ dev->dev_slotstate));
+ cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON;
+ goto done;
+ }
+
+ /* In addressed/configured state;
+ * for each drop endpoint ctx flag:
+ * ep->state = DISABLED
+ * for each add endpoint ctx flag:
+ * cp(ep-in, ep-out)
+ * ep->state = RUNNING
+ * for each drop+add endpoint flag:
+ * reset ep resources
+ * cp(ep-in, ep-out)
+ * ep->state = RUNNING
+ * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled)
+ * slot->state = configured
+ *
+ * Note: the code below does not evaluate the DisabledCtx condition; the
+ * slot is marked configured unconditionally after the loop.
+ */
+
+ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL);
+ dev_ctx = dev->dev_ctx;
+ DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x\r\n",
+ input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1,
+ input_ctx->ctx_input.dwInCtx7));
+
+ for (i = 2; i <= 31; i++) {
+ ep_ctx = &dev_ctx->ctx_ep[i];
+
+ if (input_ctx->ctx_input.dwInCtx0 &
+ XHCI_INCTX_0_DROP_MASK(i)) {
+ DPRINTF((" config ep - dropping ep %d\r\n", i));
+ pci_xhci_disable_ep(dev, i);
+ }
+ /* Drop is handled before add, so drop+add re-creates the endpoint. */
+ if (input_ctx->ctx_input.dwInCtx1 &
+ XHCI_INCTX_1_ADD_MASK(i)) {
+ iep_ctx = &input_ctx->ctx_ep[i];
+
+ DPRINTF((" enable ep[%d] %08x %08x %016lx %08x\r\n",
+ i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1,
+ iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4));
+
+ memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx));
+
+ pci_xhci_init_ep(dev, i);
+
+ /* ep state */
+ ep_ctx->dwEpCtx0 = FIELD_REPLACE(
+ ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0);
+ }
+ }
+
+ /* slot state to configured */
+ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE(
+ dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27);
+ dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY(
+ dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27);
+ dev->dev_slotstate = XHCI_ST_CONFIGURED;
+
+ DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x "
+ "[3]=0x%08x\r\n",
+ slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1,
+ dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3));
+
+done:
+ return (cmderr);
+}
+
+/*
+ * Shared handler for the Reset Endpoint and Stop Endpoint commands; the TRB
+ * type field selects which one is being executed.
+ *
+ * The endpoint id from dwTrb3 must be in 1..31, else XHCI_TRB_ERROR_TRB is
+ * returned.  Any transfer state attached to the endpoint is reset, the
+ * endpoint context state becomes STOPPED and, for an endpoint without
+ * streams, qwEpCtx2 is rewritten from the saved ring address and cycle
+ * state.  For Reset Endpoint only, the device model's ue_reset hook is then
+ * run; a missing or failing hook yields XHCI_TRB_ERROR_ENDP_NOT_ON.
+ */
+static uint32_t
+pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot,
+    struct xhci_trb *trb)
+{
+	struct pci_xhci_dev_emu *dev;
+	struct pci_xhci_dev_ep *ep;
+	struct xhci_dev_ctx *dev_ctx;
+	struct xhci_endp_ctx *octx;
+	uint32_t cmdtype, epid;
+
+	epid = XHCI_TRB_3_EP_GET(trb->dwTrb3);
+
+	DPRINTF(("pci_xhci: reset ep %u: slot %u\r\n", epid, slot));
+
+	cmdtype = XHCI_TRB_3_TYPE_GET(trb->dwTrb3);
+
+	dev = XHCI_SLOTDEV_PTR(sc, slot);
+	assert(dev != NULL);
+
+	/*
+	 * XXX a Stop Endpoint TRB with XHCI_TRB_3_SUSP_EP_BIT set should
+	 * suspend the endpoint for 10ms; that is not implemented.
+	 */
+
+	if (epid < 1 || epid > 31) {
+		DPRINTF(("pci_xhci: reset ep: invalid epid %u\r\n", epid));
+		return (XHCI_TRB_ERROR_TRB);
+	}
+
+	ep = &dev->eps[epid];
+	if (ep->ep_xfer != NULL) {
+		USB_DATA_XFER_RESET(ep->ep_xfer);
+	}
+
+	dev_ctx = dev->dev_ctx;
+	assert(dev_ctx != NULL);
+
+	octx = &dev_ctx->ctx_ep[epid];
+	octx->dwEpCtx0 = (octx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED;
+
+	/* Without streams the dequeue pointer lives in the context itself. */
+	if (XHCI_EPCTX_0_MAXP_STREAMS_GET(octx->dwEpCtx0) == 0)
+		octx->qwEpCtx2 = ep->ep_ringaddr | ep->ep_ccs;
+
+	DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x\r\n",
+	    epid, octx->dwEpCtx0, octx->dwEpCtx1, octx->qwEpCtx2,
+	    octx->dwEpCtx4));
+
+	/* Only Reset Endpoint involves the device model. */
+	if (cmdtype != XHCI_TRB_TYPE_RESET_EP)
+		return (XHCI_TRB_ERROR_SUCCESS);
+
+	if (dev->dev_ue->ue_reset == NULL)
+		return (XHCI_TRB_ERROR_ENDP_NOT_ON);
+
+	if (dev->dev_ue->ue_reset(dev->dev_sc) < 0)
+		return (XHCI_TRB_ERROR_ENDP_NOT_ON);
+
+	return (XHCI_TRB_ERROR_SUCCESS);
+}
+
+
+/*
+ * Look up the stream context for `streamid` in the endpoint's stream
+ * context array.
+ *
+ * ep->qwEpCtx2 (low four bits masked off) is taken as the guest address of
+ * the array.  On success *osctx points at the selected entry and
+ * XHCI_TRB_ERROR_SUCCESS is returned; on any failure *osctx is left
+ * untouched.
+ *
+ * Errors:
+ *   XHCI_TRB_ERROR_TRB          the endpoint's MaxPStreams field is 0
+ *   XHCI_TRB_ERROR_INVALID_SID  MaxPStreams exceeds XHCI_STREAMS_MAX, or
+ *                               the LSA bit is clear
+ *   XHCI_TRB_ERROR_STREAM_TYPE  streamid is above MaxPStreams, or the
+ *                               entry's SCT field is 0
+ *
+ * NOTE(review): streamid is compared against the raw MaxPStreams field
+ * value -- confirm that is the intended upper bound for the array.
+ */
+static uint32_t
+pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep,
+    uint32_t streamid, struct xhci_stream_ctx **osctx)
+{
+	struct xhci_stream_ctx *sctx;
+	uint32_t maxpstreams;
+
+	maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0);
+	if (maxpstreams == 0)
+		return (XHCI_TRB_ERROR_TRB);
+
+	if (maxpstreams > XHCI_STREAMS_MAX)
+		return (XHCI_TRB_ERROR_INVALID_SID);
+
+	if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) {
+		DPRINTF(("pci_xhci: find_stream; LSA bit not set\r\n"));
+		return (XHCI_TRB_ERROR_INVALID_SID);
+	}
+
+	/* only support primary stream */
+	if (streamid > maxpstreams)
+		return (XHCI_TRB_ERROR_STREAM_TYPE);
+
+	/*
+	 * Give the mapped address its element type before indexing, so that
+	 * streamid advances by whole stream contexts.  Adding to the untyped
+	 * pointer directly would advance by streamid bytes instead.
+	 */
+	sctx = (struct xhci_stream_ctx *)XHCI_GADDR(sc,
+	    ep->qwEpCtx2 & ~0xFUL);
+	sctx += streamid;
+	if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0))
+		return (XHCI_TRB_ERROR_STREAM_TYPE);
+
+	*osctx = sctx;
+
+	return (XHCI_TRB_ERROR_SUCCESS);
+}
+
+/*
+ * Handle a Set TR Dequeue Pointer command for `slot`.
+ *
+ * The endpoint id from dwTrb3 must be in 1..31 (else XHCI_TRB_ERROR_TRB)
+ * and the endpoint context must be in the STOPPED or ERROR state (else
+ * XHCI_TRB_ERROR_CONTEXT_STATE).
+ *
+ * For an endpoint with streams, the stream is looked up with
+ * pci_xhci_find_stream(), whose result becomes the completion code; only
+ * when a stream context is found are the per-stream dequeue pointer and
+ * cycle state updated from qwTrb0.  For an endpoint without streams,
+ * qwEpCtx2 and ep_ringaddr take qwTrb0 with the low four bits cleared,
+ * ep_ccs takes bit 0 of qwTrb0, and ep_tr is re-mapped.
+ *
+ * In both cases that get past the state check the endpoint context state
+ * is written back as STOPPED, including when the stream lookup failed.
+ *
+ * NOTE(review): a slot with no device, or one with no device context,
+ * trips an assert() below.
+ */
+static uint32_t
+pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot,
+ struct xhci_trb *trb)
+{
+ struct pci_xhci_dev_emu *dev;
+ struct pci_xhci_dev_ep *devep;
+ struct xhci_dev_ctx *dev_ctx;
+ struct xhci_endp_ctx *ep_ctx;
+ uint32_t cmderr, epid;
+ uint32_t streamid;
+
+ cmderr = XHCI_TRB_ERROR_SUCCESS;
+
+ dev = XHCI_SLOTDEV_PTR(sc, slot);
+ assert(dev != NULL);
+
+ DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u\r\n"
+ " stream-id %u, slot %u, epid %u, C %u\r\n",
+ (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7),
+ (uint32_t)(trb->qwTrb0 & 0x1), (trb->dwTrb2 >> 16) & 0xFFFF,
+ XHCI_TRB_3_SLOT_GET(trb->dwTrb3),
+ XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1));
+
+ epid = XHCI_TRB_3_EP_GET(trb->dwTrb3);
+ if (epid < 1 || epid > 31) {
+ DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u\r\n", epid));
+ cmderr = XHCI_TRB_ERROR_TRB;
+ goto done;
+ }
+
+ dev_ctx = dev->dev_ctx;
+ assert(dev_ctx != NULL);
+
+ ep_ctx = &dev_ctx->ctx_ep[epid];
+ devep = &dev->eps[epid];
+ /* The dequeue pointer may only be moved while stopped or in error. */
+ switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) {
+ case XHCI_ST_EPCTX_STOPPED:
+ case XHCI_ST_EPCTX_ERROR:
+ break;
+ default:
+ DPRINTF(("pci_xhci cmd set_tr invalid state %x\r\n",
+ XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)));
+ cmderr = XHCI_TRB_ERROR_CONTEXT_STATE;
+ goto done;
+ }
+
+ streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2);
+ if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) {
+ struct xhci_stream_ctx *sctx;
+
+ sctx = NULL;
+ cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx);
+ if (sctx != NULL) {
+ assert(devep->ep_sctx != NULL);
+
+ devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0;
+ devep->ep_sctx_trbs[streamid].ringaddr =
+ trb->qwTrb0 & ~0xF;
+ devep->ep_sctx_trbs[streamid].ccs =
+ XHCI_EPCTX_2_DCS_GET(trb->qwTrb0);
+ }
+ } else {
+ if (streamid != 0) {
+ DPRINTF(("pci_xhci cmd set_tr streamid %x != 0\r\n",
+ streamid));
+ }
+ ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL;
+ devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL;
+ devep->ep_ccs = trb->qwTrb0 & 0x1;
+ devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr);
+
+ DPRINTF(("pci_xhci set_tr first TRB:\r\n"));
+ pci_xhci_dump_trb(devep->ep_tr);
+ }
+ ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED;
+
+done:
+ return (cmderr);
+}
+/*
+ * Handle an Evaluate Context command for `slot`.
+ *
+ * trb->qwTrb0, with its low four bits masked off, is the guest address of
+ * the input context.  The command fails with XHCI_TRB_ERROR_TRB when any
+ * drop flag (dwInCtx0) is set or when neither add flag 0 nor add flag 1
+ * (dwInCtx1) is set.
+ *
+ * Add flag 0 copies the low 16 bits of dwSctx1 and the 10-bit field at bit
+ * 22 of dwSctx2 from the input slot context into the device's slot context.
+ * Add flag 1 copies the upper 16 bits of dwEpCtx1 from the input ctx_ep[1]
+ * into the device's ctx_ep[1].  Nothing else in the device context is
+ * changed.
+ *
+ * Returns an XHCI_TRB_ERROR_* completion code.
+ */
+static uint32_t
+pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot,
+ struct xhci_trb *trb)
+{
+ struct xhci_input_dev_ctx *input_ctx;
+ struct xhci_slot_ctx *islot_ctx;
+ struct xhci_dev_ctx *dev_ctx;
+ struct xhci_endp_ctx *ep0_ctx;
+ uint32_t cmderr;
+
+ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL);
+ islot_ctx = &input_ctx->ctx_slot;
+ ep0_ctx = &input_ctx->ctx_ep[1];
+
+ cmderr = XHCI_TRB_ERROR_SUCCESS;
+ DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,\r\n"
+ " slot %08x %08x %08x %08x\r\n"
+ " ep0 %08x %08x %016lx %08x\r\n",
+ input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1,
+ islot_ctx->dwSctx0, islot_ctx->dwSctx1,
+ islot_ctx->dwSctx2, islot_ctx->dwSctx3,
+ ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2,
+ ep0_ctx->dwEpCtx4));
+
+ /* this command expects drop-ctx=0 and at least one of add-ctx slot/ep0 */
+ if ((input_ctx->ctx_input.dwInCtx0 != 0) ||
+ (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) {
+ DPRINTF(("pci_xhci: eval ctx, input ctl invalid\r\n"));
+ cmderr = XHCI_TRB_ERROR_TRB;
+ goto done;
+ }
+
+ /* assign address to slot; in this emulation, slot_id = address */
+ dev_ctx = pci_xhci_get_dev_ctx(sc, slot);
+
+ DPRINTF(("pci_xhci: eval ctx, dev ctx\r\n"
+ " slot %08x %08x %08x %08x\r\n",
+ dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1,
+ dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3));
+
+ if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */
+ /* set max exit latency */
+ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY(
+ dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1,
+ 0xFFFF, 0);
+
+ /* set interrupter target */
+ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY(
+ dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2,
+ 0x3FF, 22);
+ }
+ if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */
+ /* set max packet size */
+ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY(
+ dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1,
+ 0xFFFF, 16);
+ /* From here on ep0_ctx names the device's copy, for the debug dump. */
+ ep0_ctx = &dev_ctx->ctx_ep[1];
+ }
+
+ DPRINTF(("pci_xhci: eval ctx, output ctx\r\n"
+ " slot %08x %08x %08x %08x\r\n"
+ " ep0 %08x %08x %016lx %08x\r\n",
+ dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1,
+ dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3,
+ ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2,
+ ep0_ctx->dwEpCtx4));
+
+done:
+ return (cmderr);
+}
+
+/*
+ * Process every command TRB the guest has queued on the command ring.
+ *
+ * Starting at sc->opregs.cr_p, TRBs are consumed for as long as their cycle
+ * bit matches the ring cycle state taken from CRCR.  Link TRBs only toggle
+ * the cycle state (when their TC bit is set).  Every other TRB is
+ * dispatched by type and answered with a Command Completion event that
+ * carries the command's ring address as tracked in `crcr`, the completion
+ * code and the slot id; the event is queued with an interrupt request.
+ * Command types without a handler, and unknown types, complete with
+ * XHCI_TRB_ERROR_SUCCESS.
+ *
+ * The CRR bit is set in CRCR for the duration of the call.  On exit CRCR
+ * holds the address of the first unconsumed TRB, the preserved CA bit and
+ * the current cycle state.
+ *
+ * Always returns 0.  The result of pci_xhci_insert_event() is not checked.
+ */
+static int
+pci_xhci_complete_commands(struct pci_xhci_softc *sc)
+{
+	struct xhci_trb evtrb;
+	struct xhci_trb *trb;
+	uint64_t crcr;
+	uint32_t ccs;		/* cycle state (XHCI 4.9.2) */
+	uint32_t type;
+	uint32_t slot;
+	uint32_t cmderr;
+	int error;
+
+	error = 0;
+	sc->opregs.crcr |= XHCI_CRCR_LO_CRR;
+
+	trb = sc->opregs.cr_p;
+	ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS;
+	crcr = sc->opregs.crcr & ~0xF;
+
+	while (1) {
+		/* Remember where to resume on the next doorbell. */
+		sc->opregs.cr_p = trb;
+
+		type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3);
+
+		/* A cycle bit mismatch means the guest has queued no more. */
+		if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) !=
+		    (ccs & XHCI_TRB_3_CYCLE_BIT))
+			break;
+
+		DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x"
+		    " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u\r\n",
+		    type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3,
+		    trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs));
+
+		cmderr = XHCI_TRB_ERROR_SUCCESS;
+		evtrb.dwTrb2 = 0;
+		evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) |
+		    XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE);
+		slot = 0;
+
+		switch (type) {
+		case XHCI_TRB_TYPE_LINK:			/* 0x06 */
+			if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT)
+				ccs ^= XHCI_CRCR_LO_RCS;
+			break;
+
+		case XHCI_TRB_TYPE_ENABLE_SLOT:			/* 0x09 */
+			cmderr = pci_xhci_cmd_enable_slot(sc, &slot);
+			break;
+
+		case XHCI_TRB_TYPE_DISABLE_SLOT:		/* 0x0A */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_disable_slot(sc, slot);
+			break;
+
+		case XHCI_TRB_TYPE_ADDRESS_DEVICE:		/* 0x0B */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_address_device(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_CONFIGURE_EP:		/* 0x0C */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_config_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_EVALUATE_CTX:		/* 0x0D */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_RESET_EP:			/* 0x0E */
+			/* Read the slot first so the debug line shows it. */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			DPRINTF(("Reset Endpoint on slot %d\r\n", slot));
+			cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_STOP_EP:			/* 0x0F */
+			/* Stop shares the Reset Endpoint handler. */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			DPRINTF(("Stop Endpoint on slot %d\r\n", slot));
+			cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_SET_TR_DEQUEUE:		/* 0x10 */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_set_tr(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_RESET_DEVICE:		/* 0x11 */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_reset_device(sc, slot);
+			break;
+
+		case XHCI_TRB_TYPE_FORCE_EVENT:			/* 0x12 */
+			/* TODO: */
+			break;
+
+		case XHCI_TRB_TYPE_NEGOTIATE_BW:		/* 0x13 */
+			break;
+
+		case XHCI_TRB_TYPE_SET_LATENCY_TOL:		/* 0x14 */
+			break;
+
+		case XHCI_TRB_TYPE_GET_PORT_BW:			/* 0x15 */
+			break;
+
+		case XHCI_TRB_TYPE_FORCE_HEADER:		/* 0x16 */
+			break;
+
+		case XHCI_TRB_TYPE_NOOP_CMD:			/* 0x17 */
+			break;
+
+		default:
+			DPRINTF(("pci_xhci: unsupported cmd %x\r\n", type));
+			break;
+		}
+
+		if (type != XHCI_TRB_TYPE_LINK) {
+			/*
+			 * insert command completion event and assert intr
+			 */
+			evtrb.qwTrb0 = crcr;
+			evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr);
+			evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot);
+			DPRINTF(("pci_xhci: command 0x%x result: 0x%x\r\n",
+			    type, cmderr));
+			pci_xhci_insert_event(sc, &evtrb, 1);
+		}
+
+		/* Advance; pci_xhci_trb_next() also updates crcr. */
+		trb = pci_xhci_trb_next(sc, trb, &crcr);
+	}
+
+	sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs;
+	sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR;
+	return (error);
+}
+
+/*
+ * Emit a one-line debug description of a TRB: its host address, numeric
+ * type, a symbolic type name and its three raw fields.  Types above
+ * XHCI_TRB_TYPE_NOOP_CMD are labelled "INVALID".
+ */
+static void
+pci_xhci_dump_trb(struct xhci_trb *trb)
+{
+	/* Indexed by TRB type value. */
+	static const char *trbtypes[] = {
+		"RESERVED",
+		"NORMAL",
+		"SETUP_STAGE",
+		"DATA_STAGE",
+		"STATUS_STAGE",
+		"ISOCH",
+		"LINK",
+		"EVENT_DATA",
+		"NOOP",
+		"ENABLE_SLOT",
+		"DISABLE_SLOT",
+		"ADDRESS_DEVICE",
+		"CONFIGURE_EP",
+		"EVALUATE_CTX",
+		"RESET_EP",
+		"STOP_EP",
+		"SET_TR_DEQUEUE",
+		"RESET_DEVICE",
+		"FORCE_EVENT",
+		"NEGOTIATE_BW",
+		"SET_LATENCY_TOL",
+		"GET_PORT_BW",
+		"FORCE_HEADER",
+		"NOOP_CMD"
+	};
+	const char *label;
+	uint32_t type;
+
+	type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3);
+
+	if (type > XHCI_TRB_TYPE_NOOP_CMD)
+		label = "INVALID";
+	else
+		label = trbtypes[type];
+
+	DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x\r\n",
+	    trb, type, label, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3));
+}
+/*
+ * Retire the processed blocks of `xfer` and queue Transfer Events for them.
+ *
+ * Blocks are visited in ring order starting at xfer->head while xfer->ndata
+ * is positive.  The walk stops at the first block not yet marked processed
+ * and records its index in xfer->head.  For every processed block: ndata is
+ * decremented, the block's cycle state is written back into bit 0 of the
+ * guest TRB's dwTrb3, and the endpoint's dequeue position is advanced to
+ * the block's trbnext via pci_xhci_update_ep_ring().
+ *
+ * An event is queued only for TRBs with the IOC bit set (or ISP together
+ * with a short-packet status).  Event Data TRBs report the byte count
+ * accumulated in `edtla` since the previous Event Data TRB and carry the
+ * TRB's own qwTrb0; other TRBs report the TRB's guest address and the
+ * block's blen.
+ *
+ * *do_intr is set to 1 as soon as an event is attempted; the events are
+ * queued without an interrupt request.  Returns XHCI_TRB_ERROR_SUCCESS, or
+ * the error from pci_xhci_insert_event(), at which point the walk stops.
+ *
+ * NOTE(review): `err` is only ever assigned from pci_xhci_insert_event()
+ * and any non-success value leaves the loop, so the short-packet branch of
+ * the IOC test looks unreachable as written.
+ */
+static int
+pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer,
+ uint32_t slot, uint32_t epid, int *do_intr)
+{
+ struct pci_xhci_dev_emu *dev;
+ struct pci_xhci_dev_ep *devep;
+ struct xhci_dev_ctx *dev_ctx;
+ struct xhci_endp_ctx *ep_ctx;
+ struct xhci_trb *trb;
+ struct xhci_trb evtrb;
+ uint32_t trbflags;
+ uint32_t edtla;
+ int i, err;
+
+ dev = XHCI_SLOTDEV_PTR(sc, slot);
+ devep = &dev->eps[epid];
+ dev_ctx = pci_xhci_get_dev_ctx(sc, slot);
+
+ assert(dev_ctx != NULL);
+
+ ep_ctx = &dev_ctx->ctx_ep[epid];
+
+ err = XHCI_TRB_ERROR_SUCCESS;
+ *do_intr = 0;
+ edtla = 0;
+
+ /* go through list of TRBs and insert event(s) */
+ for (i = xfer->head; xfer->ndata > 0; ) {
+ evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data;
+ trb = XHCI_GADDR(sc, evtrb.qwTrb0);
+ trbflags = trb->dwTrb3;
+
+ DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x "
+ "(err %d) IOC?%d\r\n",
+ i, xfer->data[i].processed, xfer->data[i].blen,
+ XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0,
+ trbflags, err,
+ trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0));
+
+ if (!xfer->data[i].processed) {
+ xfer->head = i;
+ break;
+ }
+
+ xfer->ndata--;
+ edtla += xfer->data[i].bdone;
+
+ trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs);
+
+ pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx,
+ xfer->data[i].streamid, xfer->data[i].trbnext,
+ xfer->data[i].ccs);
+
+ /* Only interrupt if IOC or short packet */
+ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) &&
+ !((err == XHCI_TRB_ERROR_SHORT_PKT) &&
+ (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) {
+
+ i = (i + 1) % USB_MAX_XFER_BLOCKS;
+ continue;
+ }
+ /* Default event: qwTrb0 already holds the TRB address; report blen. */
+ evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) |
+ XHCI_TRB_2_REM_SET(xfer->data[i].blen);
+
+ evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) |
+ XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid);
+
+ if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) {
+ DPRINTF(("pci_xhci EVENT_DATA edtla %u\r\n", edtla));
+ evtrb.qwTrb0 = trb->qwTrb0;
+ evtrb.dwTrb2 = (edtla & 0xFFFFF) |
+ XHCI_TRB_2_ERROR_SET(err);
+ evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT;
+ edtla = 0;
+ }
+
+ *do_intr = 1;
+ /* Queued with do_intr = 0: raising the interrupt is left to the caller. */
+ err = pci_xhci_insert_event(sc, &evtrb, 0);
+ if (err != XHCI_TRB_ERROR_SUCCESS) {
+ break;
+ }
+
+ i = (i + 1) % USB_MAX_XFER_BLOCKS;
+ }
+
+ return (err);
+}
+
+/*
+ * Record a new dequeue position (`ringaddr`, low four bits ignored) and
+ * cycle state (bit 0 of `ccs`) for an endpoint.
+ *
+ * For an endpoint with streams the per-stream context and ring bookkeeping
+ * at index `streamid` are updated, and only bit 0 of qwEpCtx2 is changed.
+ * Otherwise the endpoint's own ring address, cycle state and mapped TRB
+ * pointer are updated and qwEpCtx2 is rewritten in full.  `dev` is unused.
+ */
+static void
+pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev,
+    struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx,
+    uint32_t streamid, uint64_t ringaddr, int ccs)
+{
+	const uint64_t deq = ringaddr & ~0xFUL;
+	const int cycle = ccs & 0x1;
+
+	if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) {
+		/* No streams: the endpoint owns a single transfer ring. */
+		devep->ep_ringaddr = deq;
+		devep->ep_ccs = cycle;
+		devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL);
+		ep_ctx->qwEpCtx2 = deq | cycle;
+
+		DPRINTF(("xhci update ep-ring, addr %lx\r\n",
+		    (devep->ep_ringaddr | devep->ep_ccs)));
+		return;
+	}
+
+	/* Streams: update the slot for this stream id. */
+	devep->ep_sctx[streamid].qwSctx0 = deq | cycle;
+
+	devep->ep_sctx_trbs[streamid].ringaddr = deq;
+	devep->ep_sctx_trbs[streamid].ccs = cycle;
+	ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | cycle;
+
+	DPRINTF(("xhci update ep-ring stream %d, addr %lx\r\n",
+	    streamid, devep->ep_sctx[streamid].qwSctx0));
+}
+
+/*
+ * Outstanding transfer still in progress (device NAK'd earlier) so retry
+ * the transfer again to see if it succeeds.
+ *
+ * The endpoint context is marked RUNNING, then the device model's ue_data
+ * hook (if any) is called on the endpoint's pending transfer; the direction
+ * is IN for odd endpoint ids and OUT for even ones, and the endpoint number
+ * passed is epid / 2.
+ *
+ * If the hook returns USB_ERR_CANCELLED the transfer is left queued; the
+ * return value becomes XHCI_TRB_ERROR_SUCCESS when the head block's error
+ * code is USB_NAK and stays USB_ERR_CANCELLED otherwise.  For any other
+ * hook result the finished blocks are retired with
+ * pci_xhci_xfer_complete(), the interrupt is asserted when that succeeds
+ * and requests one, and the transfer is reset.
+ *
+ * Returns 0 when there is no ue_data hook, otherwise the value described
+ * above or the result of pci_xhci_xfer_complete().
+ *
+ * Locking: on FreeBSD builds this function takes and releases the transfer
+ * lock itself; on other builds the caller must already hold it.
+ */
+static int
+pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc,
+ struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep,
+ struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid)
+{
+ struct usb_data_xfer *xfer;
+ int err;
+ int do_intr;
+
+ ep_ctx->dwEpCtx0 = FIELD_REPLACE(
+ ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0);
+
+ err = 0;
+ do_intr = 0;
+
+ xfer = devep->ep_xfer;
+#ifdef __FreeBSD__
+ USB_DATA_XFER_LOCK(xfer);
+#else
+ /*
+ * At least one caller needs to hold this lock across the call to this
+ * function and other code. To avoid deadlock from a recursive mutex
+ * enter, we ensure that all callers hold this lock.
+ */
+ assert(USB_DATA_XFER_LOCK_HELD(xfer));
+#endif
+
+ /* outstanding requests queued up */
+ if (dev->dev_ue->ue_data != NULL) {
+ err = dev->dev_ue->ue_data(dev->dev_sc, xfer,
+ epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2);
+ if (err == USB_ERR_CANCELLED) {
+ if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) ==
+ USB_NAK)
+ err = XHCI_TRB_ERROR_SUCCESS;
+ } else {
+ err = pci_xhci_xfer_complete(sc, xfer, slot, epid,
+ &do_intr);
+ if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) {
+ pci_xhci_assert_interrupt(sc);
+ }
+
+
+ /* XXX should not do it if error? */
+ USB_DATA_XFER_RESET(xfer);
+ }
+ }
+
+#ifdef __FreeBSD__
+ USB_DATA_XFER_UNLOCK(xfer);
+#endif
+
+ return (err);
+}
+
+/*
+ * Gather the TRBs queued on an endpoint's transfer ring into the
+ * endpoint's usb_data_xfer and hand them to the device model.
+ *
+ * `trb` is the host pointer to the first TRB, `addr` its guest address and
+ * `ccs` the ring's current cycle state.  The endpoint context is marked
+ * RUNNING and the transfer lock is taken.  TRBs are appended one block per
+ * TRB until a non-link TRB with a mismatching cycle bit is found, a TRB
+ * outside a control transfer has no chain bit, or a TRB with IOC set has
+ * been appended.  Link, Setup Stage and No-Op blocks are marked processed
+ * immediately, as are Event Data blocks with IOC on endpoints other than 1.
+ *
+ * For endpoint id 1 the batch goes to the device model's ue_request hook
+ * and, on success or short packet, is retired with
+ * pci_xhci_xfer_complete().  If the batch ended on an IOC TRB the transfer
+ * is reset and the ring walk resumes at the `retry` label with the lock
+ * still held.  For every other endpoint the batch goes to
+ * pci_xhci_try_usb_xfer() and XHCI_TRB_ERROR_SUCCESS is returned regardless
+ * of its result.
+ *
+ * Returns an XHCI_TRB_ERROR_* code; XHCI_TRB_ERROR_TRB for a malformed
+ * Setup Stage TRB, a Normal/Isoch TRB inside a control transfer, or an
+ * unexpected TRB type.
+ *
+ * NOTE(review): the malloc() result for xfer->ureq is not checked, and the
+ * block returned by usb_data_xfer_append() is dereferenced without a NULL
+ * check in the Link, Setup Stage and No-Op cases -- confirm that neither
+ * can fail here.
+ */
+static int
+pci_xhci_handle_transfer(struct pci_xhci_softc *sc,
+ struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep,
+ struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot,
+ uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid)
+{
+ struct xhci_trb *setup_trb;
+ struct usb_data_xfer *xfer;
+ struct usb_data_xfer_block *xfer_block;
+ uint64_t val;
+ uint32_t trbflags;
+ int do_intr, err;
+ int do_retry;
+
+ ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0,
+ XHCI_ST_EPCTX_RUNNING, 0x7, 0);
+
+ xfer = devep->ep_xfer;
+ USB_DATA_XFER_LOCK(xfer);
+
+ DPRINTF(("pci_xhci handle_transfer slot %u\r\n", slot));
+
+retry:
+ err = 0;
+ do_retry = 0;
+ do_intr = 0;
+ setup_trb = NULL;
+
+ while (1) {
+ pci_xhci_dump_trb(trb);
+
+ trbflags = trb->dwTrb3;
+ /* A non-link TRB whose cycle bit differs from ccs ends the queued work. */
+ if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK &&
+ (trbflags & XHCI_TRB_3_CYCLE_BIT) !=
+ (ccs & XHCI_TRB_3_CYCLE_BIT)) {
+ DPRINTF(("Cycle-bit changed trbflags %x, ccs %x\r\n",
+ trbflags & XHCI_TRB_3_CYCLE_BIT, ccs));
+ break;
+ }
+
+ xfer_block = NULL;
+
+ switch (XHCI_TRB_3_TYPE_GET(trbflags)) {
+ case XHCI_TRB_TYPE_LINK:
+ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT)
+ ccs ^= 0x1;
+
+ xfer_block = usb_data_xfer_append(xfer, NULL, 0,
+ (void *)addr, ccs);
+ xfer_block->processed = 1;
+ break;
+
+ case XHCI_TRB_TYPE_SETUP_STAGE:
+ if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 ||
+ XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) {
+ DPRINTF(("pci_xhci: invalid setup trb\r\n"));
+ err = XHCI_TRB_ERROR_TRB;
+ goto errout;
+ }
+ setup_trb = trb;
+ /* The request is carried inline in qwTrb0; copy it into xfer->ureq. */
+ val = trb->qwTrb0;
+ if (!xfer->ureq)
+ xfer->ureq = malloc(
+ sizeof(struct usb_device_request));
+ memcpy(xfer->ureq, &val,
+ sizeof(struct usb_device_request));
+
+ xfer_block = usb_data_xfer_append(xfer, NULL, 0,
+ (void *)addr, ccs);
+ xfer_block->processed = 1;
+ break;
+
+ case XHCI_TRB_TYPE_NORMAL:
+ case XHCI_TRB_TYPE_ISOCH:
+ if (setup_trb != NULL) {
+ DPRINTF(("pci_xhci: trb not supposed to be in "
+ "ctl scope\r\n"));
+ err = XHCI_TRB_ERROR_TRB;
+ goto errout;
+ }
+ /* fall through */
+
+ case XHCI_TRB_TYPE_DATA_STAGE:
+ xfer_block = usb_data_xfer_append(xfer,
+ (void *)(trbflags & XHCI_TRB_3_IDT_BIT ?
+ &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)),
+ trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs);
+ break;
+
+ case XHCI_TRB_TYPE_STATUS_STAGE:
+ xfer_block = usb_data_xfer_append(xfer, NULL, 0,
+ (void *)addr, ccs);
+ break;
+
+ case XHCI_TRB_TYPE_NOOP:
+ xfer_block = usb_data_xfer_append(xfer, NULL, 0,
+ (void *)addr, ccs);
+ xfer_block->processed = 1;
+ break;
+
+ case XHCI_TRB_TYPE_EVENT_DATA:
+ xfer_block = usb_data_xfer_append(xfer, NULL, 0,
+ (void *)addr, ccs);
+ if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) {
+ xfer_block->processed = 1;
+ }
+ break;
+
+ default:
+ DPRINTF(("pci_xhci: handle xfer unexpected trb type "
+ "0x%x\r\n",
+ XHCI_TRB_3_TYPE_GET(trbflags)));
+ err = XHCI_TRB_ERROR_TRB;
+ goto errout;
+ }
+
+ trb = pci_xhci_trb_next(sc, trb, &addr);
+
+ DPRINTF(("pci_xhci: next trb: 0x%lx\r\n", (uint64_t)trb));
+
+ if (xfer_block) {
+ xfer_block->trbnext = addr;
+ xfer_block->streamid = streamid;
+ }
+ /* Outside a control transfer, an unchained non-link TRB ends the batch. */
+ if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) &&
+ XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) {
+ break;
+ }
+
+ /* handle current batch that requires interrupt on complete */
+ if (trbflags & XHCI_TRB_3_IOC_BIT) {
+ DPRINTF(("pci_xhci: trb IOC bit set\r\n"));
+ if (epid == 1)
+ do_retry = 1;
+ break;
+ }
+ }
+
+ DPRINTF(("pci_xhci[%d]: xfer->ndata %u\r\n", __LINE__, xfer->ndata));
+ /* Endpoint id 1 uses ue_request; all others use pci_xhci_try_usb_xfer(). */
+ if (epid == 1) {
+ err = USB_ERR_NOT_STARTED;
+ if (dev->dev_ue->ue_request != NULL)
+ err = dev->dev_ue->ue_request(dev->dev_sc, xfer);
+ setup_trb = NULL;
+ } else {
+ /* handle data transfer */
+ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid);
+ err = XHCI_TRB_ERROR_SUCCESS;
+ goto errout;
+ }
+
+ err = USB_TO_XHCI_ERR(err);
+ if ((err == XHCI_TRB_ERROR_SUCCESS) ||
+ (err == XHCI_TRB_ERROR_SHORT_PKT)) {
+ err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr);
+ if (err != XHCI_TRB_ERROR_SUCCESS)
+ do_retry = 0;
+ }
+
+errout:
+ if (err == XHCI_TRB_ERROR_EV_RING_FULL)
+ DPRINTF(("pci_xhci[%d]: event ring full\r\n", __LINE__));
+ /* When retrying, the lock stays held across the jump back to retry. */
+ if (!do_retry)
+ USB_DATA_XFER_UNLOCK(xfer);
+
+ if (do_intr)
+ pci_xhci_assert_interrupt(sc);
+
+ if (do_retry) {
+ USB_DATA_XFER_RESET(xfer);
+ DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs\r\n",
+ __LINE__));
+ goto retry;
+ }
+
+ if (epid == 1)
+ USB_DATA_XFER_RESET(xfer);
+
+ return (err);
+}
+
+/*
+ * Handle a guest write to the doorbell register of device slot `slot`.
+ *
+ * `epid` is the doorbell target (endpoint context index) and `streamid` the
+ * stream id taken from the written value.  The doorbell is ignored when the
+ * slot is 0 or above sc->ndevices, when epid is outside 1..31, when the
+ * slot has no device or device context, when the endpoint context's
+ * qwEpCtx2 is 0, or when the endpoint has no transfer state.
+ *
+ * If the endpoint already has blocks pending, the pending transfer is
+ * retried via pci_xhci_try_usb_xfer().  Otherwise the ring position is
+ * taken from the per-stream bookkeeping (endpoints with streams) or from
+ * the endpoint itself, and the ring is processed with
+ * pci_xhci_handle_transfer() unless the TRB at that position has type 0.
+ *
+ * NOTE(review): `streamid` is still used unchecked as an index into
+ * ep_sctx_trbs[], and the TRB pointer is dereferenced without a NULL
+ * check -- confirm both are guaranteed valid at this point.
+ */
+static void
+pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot,
+    uint32_t epid, uint32_t streamid)
+{
+	struct pci_xhci_dev_emu *dev;
+	struct pci_xhci_dev_ep *devep;
+	struct xhci_dev_ctx *dev_ctx;
+	struct xhci_endp_ctx *ep_ctx;
+	struct pci_xhci_trb_ring *sctx_tr;
+	struct xhci_trb *trb;
+	uint64_t ringaddr;
+	uint32_t ccs;
+
+	DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u\r\n",
+	    slot, epid, streamid));
+
+	if (slot == 0 || slot > sc->ndevices) {
+		DPRINTF(("pci_xhci: invalid doorbell slot %u\r\n", slot));
+		return;
+	}
+
+	/*
+	 * epid comes straight from the value the guest wrote and indexes
+	 * both dev->eps[] and dev_ctx->ctx_ep[]; apply the same 1..31 range
+	 * the endpoint command handlers enforce.
+	 */
+	if (epid < 1 || epid > 31) {
+		DPRINTF(("pci_xhci: invalid doorbell epid %u\r\n", epid));
+		return;
+	}
+
+	dev = XHCI_SLOTDEV_PTR(sc, slot);
+	if (dev == NULL)
+		return;
+
+	devep = &dev->eps[epid];
+	dev_ctx = pci_xhci_get_dev_ctx(sc, slot);
+	if (!dev_ctx) {
+		return;
+	}
+	ep_ctx = &dev_ctx->ctx_ep[epid];
+
+	sctx_tr = NULL;
+
+	DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x\r\n",
+	    epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2,
+	    ep_ctx->dwEpCtx4));
+
+	if (ep_ctx->qwEpCtx2 == 0)
+		return;
+
+	/*
+	 * The endpoint context lives in guest memory, so a non-zero qwEpCtx2
+	 * does not prove that transfer state was ever set up for it.
+	 */
+	if (devep->ep_xfer == NULL)
+		return;
+
+	/* handle pending transfers */
+	if (devep->ep_xfer->ndata > 0) {
+#ifndef __FreeBSD__
+		USB_DATA_XFER_LOCK(devep->ep_xfer);
+#endif
+		pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid);
+#ifndef __FreeBSD__
+		USB_DATA_XFER_UNLOCK(devep->ep_xfer);
+#endif
+		return;
+	}
+
+	/* get next trb work item */
+	if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) {
+		sctx_tr = &devep->ep_sctx_trbs[streamid];
+		ringaddr = sctx_tr->ringaddr;
+		ccs = sctx_tr->ccs;
+		trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL);
+		DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x\r\n",
+		    streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT,
+		    trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT));
+	} else {
+		ringaddr = devep->ep_ringaddr;
+		ccs = devep->ep_ccs;
+		trb = devep->ep_tr;
+		DPRINTF(("doorbell, ccs %lx, trb ccs %x\r\n",
+		    ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT,
+		    trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT));
+	}
+
+	/* TRB type 0 is treated as "nothing queued here". */
+	if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) {
+		DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?\r\n",
+		    ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid));
+		return;
+	}
+
+	pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid,
+	    ringaddr, ccs, streamid);
+}
+
+/*
+ * Handle a guest write into the doorbell register array.
+ *
+ * `offset` is converted into a doorbell index relative to sc->dboff.
+ * Nothing happens while the controller is halted.  Index 0 runs the
+ * command ring; any other index is passed on as a device slot doorbell,
+ * provided the controller has port registers.
+ */
+static void
+pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset,
+    uint64_t value)
+{
+	uint64_t dbidx;
+
+	dbidx = (offset - sc->dboff) / sizeof(uint32_t);
+
+	DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx\r\n",
+	    dbidx, value));
+
+	if (XHCI_HALTED(sc)) {
+		DPRINTF(("pci_xhci: controller halted\r\n"));
+		return;
+	}
+
+	if (dbidx == 0) {
+		pci_xhci_complete_commands(sc);
+		return;
+	}
+
+	if (sc->portregs == NULL)
+		return;
+
+	pci_xhci_device_doorbell(sc, dbidx,
+	    XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value));
+}
+/*
+ * Handle a guest write to the runtime register space.
+ *
+ * `offset` is made relative to sc->rtsoff.  A write at relative offset 0
+ * (MFINDEX) is ignored.  The remaining offsets are rebased by 0x20 onto the
+ * single interrupter register set kept in sc->rtsregs.intrreg:
+ *
+ *   0x00 IMAN    0x04 IMOD    0x08 ERSTSZ
+ *   0x10 ERSTBA low   0x14 ERSTBA high
+ *   0x18 ERDP low     0x1C ERDP high
+ *
+ * Writing ERSTBA high maps the event ring segment table and the segment it
+ * points at, and empties the ring.  Writing ERDP high recomputes the count
+ * of outstanding events.  Any other offset is only logged; that includes
+ * relative offsets below 0x20, which wrap in the unsigned subtraction.
+ */
+static void
+pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset,
+ uint64_t value)
+{
+ struct pci_xhci_rtsregs *rts;
+
+ offset -= sc->rtsoff;
+
+ if (offset == 0) {
+ DPRINTF(("pci_xhci attempted write to MFINDEX\r\n"));
+ return;
+ }
+
+ DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx\r\n",
+ offset, value));
+
+ offset -= 0x20; /* start of intrreg */
+
+ rts = &sc->rtsregs;
+
+ switch (offset) {
+ case 0x00:
+ /* Pending bit: writing 1 clears it, writing 0 keeps it; enable bit is taken from value. */
+ if (value & XHCI_IMAN_INTR_PEND)
+ rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND;
+ rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) |
+ (rts->intrreg.iman & XHCI_IMAN_INTR_PEND);
+
+ if (!(value & XHCI_IMAN_INTR_ENA))
+ pci_xhci_deassert_interrupt(sc);
+
+ break;
+
+ case 0x04:
+ rts->intrreg.imod = value;
+ break;
+
+ case 0x08:
+ rts->intrreg.erstsz = value & 0xFFFF;
+ break;
+
+ case 0x10:
+ /* ERSTBA low bits */
+ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) |
+ (value & ~0x3F);
+ break;
+
+ case 0x14:
+ /* ERSTBA high bits */
+ rts->intrreg.erstba = (value << 32) |
+ MASK_64_LO(sc->rtsregs.intrreg.erstba);
+ /* Map the segment table entry first, then the segment it points at. */
+ rts->erstba_p = XHCI_GADDR(sc,
+ sc->rtsregs.intrreg.erstba & ~0x3FUL);
+
+ rts->erst_p = XHCI_GADDR(sc,
+ sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL);
+ /* A freshly programmed ring starts out empty. */
+ rts->er_enq_idx = 0;
+ rts->er_events_cnt = 0;
+
+ DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u\r\n",
+ rts->erstba_p,
+ rts->erstba_p->qwEvrsTablePtr,
+ rts->erstba_p->dwEvrsTableSize));
+ break;
+
+ case 0x18:
+ /* ERDP low bits */
+ rts->intrreg.erdp =
+ MASK_64_HI(sc->rtsregs.intrreg.erdp) |
+ (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) |
+ (value & ~0xF);
+ if (value & XHCI_ERDP_LO_BUSY) {
+ rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY;
+ rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND;
+ }
+
+ rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value);
+
+ break;
+
+ case 0x1C:
+ /* ERDP high bits */
+ rts->intrreg.erdp = (value << 32) |
+ MASK_64_LO(sc->rtsregs.intrreg.erdp);
+ /* Recount outstanding events from the new dequeue position. */
+ if (rts->er_events_cnt > 0) {
+ uint64_t erdp;
+ uint32_t erdp_i;
+
+ erdp = rts->intrreg.erdp & ~0xF;
+ erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) /
+ sizeof(struct xhci_trb);
+
+ if (erdp_i <= rts->er_enq_idx)
+ rts->er_events_cnt = rts->er_enq_idx - erdp_i;
+ else
+ rts->er_events_cnt =
+ rts->erstba_p->dwEvrsTableSize -
+ (erdp_i - rts->er_enq_idx);
+
+ DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u\r\n",
+ erdp, rts->er_events_cnt));
+ }
+
+ break;
+
+ default:
+ DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx\r\n",
+ offset));
+ break;
+ }
+}
+
+/*
+ * Read one 32-bit port register.
+ *
+ * 'offset' is relative to the start of the operational registers. The port
+ * number and the register inside that port are derived from it in 0x10-byte
+ * strides based at 0x3F0, so offset 0x400 addresses port 1.
+ *
+ * Returns 0 when no port register array has been allocated and a default
+ * port status value for port numbers above XHCI_MAX_DEVS.
+ */
+static uint64_t
+pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	int port;
+	uint32_t *p;
+
+	if (sc->portregs == NULL)
+		return (0);
+
+	port = (offset - 0x3F0) / 0x10;
+
+	if (port > XHCI_MAX_DEVS) {
+		/* the message reports the same relation the test uses */
+		DPRINTF(("pci_xhci: portregs_read port %d > XHCI_MAX_DEVS\r\n",
+		    port));
+
+		/* return default value for unused port */
+		return (XHCI_PS_SPEED_SET(3));
+	}
+
+	offset = (offset - 0x3F0) % 0x10;
+
+	/* step from portsc to the requested 32-bit register of this port */
+	p = &sc->portregs[port].portsc;
+	p += offset / sizeof(uint32_t);
+
+	/* 'port' is a signed int, so it is printed with %d */
+	DPRINTF(("pci_xhci: portregs read offset 0x%lx port %d -> 0x%x\r\n",
+	    offset, port, *p));
+
+	return (*p);
+}
+
+/*
+ * Handle a guest write to the operational register region.
+ *
+ * 'offset' arrives as a BAR offset and is rebased by XHCI_CAPLEN before it
+ * is decoded. Offsets that match no named register and are at or above
+ * 0x400 are forwarded to pci_xhci_portregs_write(); other unmatched offsets
+ * are ignored.
+ */
+static void
+pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset,
+ uint64_t value)
+{
+ offset -= XHCI_CAPLEN;
+
+ if (offset < 0x400)
+ DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx\r\n",
+ offset, value));
+
+ switch (offset) {
+ case XHCI_USBCMD:
+ sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F);
+ break;
+
+ case XHCI_USBSTS:
+ /* clear bits on write */
+ sc->opregs.usbsts &= ~(value &
+ (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS|
+ XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR));
+ break;
+
+ case XHCI_PAGESIZE:
+ /* read only */
+ break;
+
+ case XHCI_DNCTRL:
+ sc->opregs.dnctrl = value & 0xFFFF;
+ break;
+
+ case XHCI_CRCR_LO:
+ /*
+ * While XHCI_CRCR_LO_CRR is set only the CS and CA bits are taken
+ * from the write; otherwise the low half of the ring pointer and
+ * the RCS bit are replaced.
+ */
+ if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) {
+ sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA);
+ sc->opregs.crcr |= value &
+ (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA);
+ } else {
+ sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) |
+ (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS));
+ }
+ break;
+
+ case XHCI_CRCR_HI:
+ /* the high half is only accepted while XHCI_CRCR_LO_CRR is clear */
+ if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) {
+ sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) |
+ (value << 32);
+
+ sc->opregs.cr_p = XHCI_GADDR(sc,
+ sc->opregs.crcr & ~0xF);
+ }
+
+ /* NOTE(review): the two branches below are empty placeholders */
+ if (sc->opregs.crcr & XHCI_CRCR_LO_CS) {
+ /* Stop operation of Command Ring */
+ }
+
+ if (sc->opregs.crcr & XHCI_CRCR_LO_CA) {
+ /* Abort command */
+ }
+
+ break;
+
+ case XHCI_DCBAAP_LO:
+ sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) |
+ (value & 0xFFFFFFC0);
+ break;
+
+ case XHCI_DCBAAP_HI:
+ /* the high-half write also refreshes the translated pointer */
+ sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) |
+ (value << 32);
+ sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL);
+
+ DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)\r\n",
+ sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p));
+ break;
+
+ case XHCI_CONFIG:
+ sc->opregs.config = value & 0x03FF;
+ break;
+
+ default:
+ if (offset >= 0x400)
+ pci_xhci_portregs_write(sc, offset, value);
+
+ break;
+ }
+}
+
+
+/*
+ * BAR 0 write handler: route a guest MMIO write to the handler for the
+ * register region that contains 'offset'. The softc mutex is held while
+ * the region handler runs. 'ctx', 'vcpu' and 'size' are not used here.
+ */
+static void
+pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_xhci_softc *xsc = pi->pi_arg;
+
+	assert(baridx == 0);
+
+	pthread_mutex_lock(&xsc->mtx);
+
+	if (offset < XHCI_CAPLEN) {
+		/* read only registers */
+		WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset));
+	} else if (offset < xsc->dboff) {
+		pci_xhci_hostop_write(xsc, offset, value);
+	} else if (offset < xsc->rtsoff) {
+		pci_xhci_dbregs_write(xsc, offset, value);
+	} else if (offset < xsc->regsend) {
+		pci_xhci_rtsregs_write(xsc, offset, value);
+	} else {
+		WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset));
+	}
+
+	pthread_mutex_unlock(&xsc->mtx);
+}
+
+/*
+ * Read a capability register. 'offset' is the BAR offset; each recognised
+ * offset returns the matching value cached in the softc, anything else
+ * reads as 0.
+ */
+static uint64_t
+pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	uint64_t regval = 0;
+
+	switch (offset) {
+	case XHCI_CAPLENGTH:	/* 0x00 */
+		regval = sc->caplength;
+		break;
+	case XHCI_HCSPARAMS1:	/* 0x04 */
+		regval = sc->hcsparams1;
+		break;
+	case XHCI_HCSPARAMS2:	/* 0x08 */
+		regval = sc->hcsparams2;
+		break;
+	case XHCI_HCSPARAMS3:	/* 0x0C */
+		regval = sc->hcsparams3;
+		break;
+	case XHCI_HCSPARAMS0:	/* 0x10 */
+		regval = sc->hccparams1;
+		break;
+	case XHCI_DBOFF:	/* 0x14 */
+		regval = sc->dboff;
+		break;
+	case XHCI_RTSOFF:	/* 0x18 */
+		regval = sc->rtsoff;
+		break;
+	case XHCI_HCCPRAMS2:	/* 0x1C */
+		regval = sc->hccparams2;
+		break;
+	default:
+		/* unknown offsets keep the initial 0 */
+		break;
+	}
+
+	DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx\r\n",
+	    offset, regval));
+
+	return (regval);
+}
+
+/*
+ * Read an operational register. 'offset' is the BAR offset and is rebased
+ * by XHCI_CAPLEN. Unmatched offsets at or above 0x400 are read through
+ * pci_xhci_portregs_read(); other unmatched offsets read as 0.
+ */
+static uint64_t
+pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	const uint64_t opoff = offset - XHCI_CAPLEN;
+	uint64_t regval = 0;
+
+	switch (opoff) {
+	case XHCI_USBCMD:	/* 0x00 */
+		regval = sc->opregs.usbcmd;
+		break;
+	case XHCI_USBSTS:	/* 0x04 */
+		regval = sc->opregs.usbsts;
+		break;
+	case XHCI_PAGESIZE:	/* 0x08 */
+		regval = sc->opregs.pgsz;
+		break;
+	case XHCI_DNCTRL:	/* 0x14 */
+		regval = sc->opregs.dnctrl;
+		break;
+	case XHCI_CRCR_LO:	/* 0x18 */
+		/* only the CRR bit is visible to the guest */
+		regval = sc->opregs.crcr & XHCI_CRCR_LO_CRR;
+		break;
+	case XHCI_CRCR_HI:	/* 0x1C */
+		/* always reads as 0 */
+		break;
+	case XHCI_DCBAAP_LO:	/* 0x30 */
+		regval = sc->opregs.dcbaap & 0xFFFFFFFF;
+		break;
+	case XHCI_DCBAAP_HI:	/* 0x34 */
+		regval = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF;
+		break;
+	case XHCI_CONFIG:	/* 0x38 */
+		regval = sc->opregs.config;
+		break;
+	default:
+		if (opoff >= 0x400)
+			regval = pci_xhci_portregs_read(sc, opoff);
+		break;
+	}
+
+	/* port register reads are logged by pci_xhci_portregs_read() */
+	if (opoff < 0x400) {
+		DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx\r\n",
+		    opoff, regval));
+	}
+
+	return (regval);
+}
+
+/*
+ * Doorbell registers have no readable state: every read yields 0 and
+ * neither argument is consulted.
+ */
+static uint64_t
+pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	(void)sc;
+	(void)offset;
+
+	return (0);
+}
+
+/*
+ * Read a runtime register. 'offset' is the BAR offset and is rebased to
+ * the start of the runtime registers. XHCI_MFINDEX returns the stored
+ * microframe index, offsets from 0x20 upwards index the interrupter
+ * register set as an array of 32-bit words, and everything else reads 0.
+ */
+static uint64_t
+pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	uint32_t regval = 0;
+
+	offset -= sc->rtsoff;
+
+	if (offset == XHCI_MFINDEX) {
+		regval = sc->rtsregs.mfindex;
+	} else if (offset >= 0x20) {
+		uint32_t *words = &sc->rtsregs.intrreg.iman;
+
+		/* from here on 'offset' is relative to the interrupter set */
+		offset -= 0x20;
+
+		assert(offset < sizeof(sc->rtsregs.intrreg));
+
+		regval = words[(offset % 32) / sizeof(uint32_t)];
+	}
+
+	DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x\r\n",
+	    offset, regval));
+
+	return (regval);
+}
+
+/*
+ * Read the extended capability words placed after the register file.
+ * 'offset' is the BAR offset and is rebased by sc->regsend. Two 16-byte
+ * entries are served: the first at 0-12 uses sc->usb2_port_start, the
+ * second at 16-28 uses sc->usb3_port_start. Unknown offsets read as 0.
+ */
+static uint64_t
+pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset)
+{
+	uint32_t capval = 0;
+
+	offset -= sc->regsend;
+
+	switch (offset) {
+	case 0:
+		/* rev major | rev minor | next-cap | cap-id */
+		capval = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS;
+		break;
+	case 16:
+		/* rev major | rev minor | next-cap | cap-id */
+		capval = (0x03 << 24) | XHCI_ID_PROTOCOLS;
+		break;
+	case 4:
+	case 20:
+		/* name string = "USB" */
+		capval = 0x20425355;
+		break;
+	case 8:
+		/* psic | proto-defined | compat # | compat offset */
+		capval = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start;
+		break;
+	case 24:
+		/* psic | proto-defined | compat # | compat offset */
+		capval = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start;
+		break;
+	case 12:
+	case 28:
+		/* last word of each entry reads as 0 */
+		break;
+	default:
+		DPRINTF(("pci_xhci: xecp invalid offset 0x%lx\r\n", offset));
+		break;
+	}
+
+	DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x\r\n",
+	    offset, capval));
+
+	return (capval);
+}
+
+
+/*
+ * BAR 0 read handler: fetch a 32-bit value from the register region that
+ * contains 'offset' while holding the softc mutex, then trim it to the
+ * access width given by 'size'. Offsets past the extended capability words
+ * read as 0. 'ctx' and 'vcpu' are not used here.
+ */
+static uint64_t
+pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct pci_xhci_softc *xsc = pi->pi_arg;
+	uint32_t regval;
+
+	assert(baridx == 0);
+
+	pthread_mutex_lock(&xsc->mtx);
+
+	if (offset < XHCI_CAPLEN) {
+		regval = pci_xhci_hostcap_read(xsc, offset);
+	} else if (offset < xsc->dboff) {
+		regval = pci_xhci_hostop_read(xsc, offset);
+	} else if (offset < xsc->rtsoff) {
+		regval = pci_xhci_dbregs_read(xsc, offset);
+	} else if (offset < xsc->regsend) {
+		regval = pci_xhci_rtsregs_read(xsc, offset);
+	} else if (offset < (xsc->regsend + 4*32)) {
+		regval = pci_xhci_xecp_read(xsc, offset);
+	} else {
+		regval = 0;
+		WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset));
+	}
+
+	pthread_mutex_unlock(&xsc->mtx);
+
+	/* other access sizes return the value untouched */
+	if (size == 1)
+		regval &= 0xFF;
+	else if (size == 2)
+		regval &= 0xFFFF;
+	else if (size == 4)
+		regval &= 0xFFFFFFFF;
+
+	return (regval);
+}
+
+/*
+ * Reset port 'portn'. Nothing happens when no device is attached to the
+ * port. Otherwise the link state, reset and reset-change bits are cleared,
+ * the port is marked enabled at the device's speed, a warm reset of a USB 3
+ * device also sets the warm-reset-change bit, and a port status change
+ * event is queued together with the reset-change bit.
+ */
+static void
+pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm)
+{
+	struct pci_xhci_portregs *port;
+	struct pci_xhci_dev_emu *dev;
+	struct xhci_trb evtrb;
+	int rc;
+
+	assert(portn <= XHCI_MAX_DEVS);
+
+	DPRINTF(("xhci reset port %d\r\n", portn));
+
+	port = XHCI_PORTREG_PTR(sc, portn);
+	dev = XHCI_DEVINST_PTR(sc, portn);
+	if (dev == NULL)
+		return;
+
+	port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC);
+	port->portsc |= XHCI_PS_PED |
+	    XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed);
+
+	if (warm && dev->dev_ue->ue_usbver == 3)
+		port->portsc |= XHCI_PS_WRC;
+
+	if ((port->portsc & XHCI_PS_PRC) != 0)
+		return;
+
+	port->portsc |= XHCI_PS_PRC;
+
+	pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS,
+	    XHCI_TRB_EVENT_PORT_STS_CHANGE);
+	rc = pci_xhci_insert_event(sc, &evtrb, 1);
+	if (rc != XHCI_TRB_ERROR_SUCCESS) {
+		DPRINTF(("xhci reset port insert event failed\r\n"));
+	}
+}
+
+/*
+ * Set the initial status of port 'portn'. A port without a device gets
+ * port power and the RX_DET link state. A port with a device is marked
+ * connected and powered at the device's speed; USB 2 devices start in the
+ * POLL link state, all others start enabled in U0.
+ */
+static void
+pci_xhci_init_port(struct pci_xhci_softc *sc, int portn)
+{
+	struct pci_xhci_portregs *port = XHCI_PORTREG_PTR(sc, portn);
+	struct pci_xhci_dev_emu *dev = XHCI_DEVINST_PTR(sc, portn);
+	uint32_t status;
+
+	if (dev == NULL) {
+		port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP;
+		DPRINTF(("Init empty port %d 0x%x\n", portn, port->portsc));
+		return;
+	}
+
+	status = XHCI_PS_CCS |		/* connected */
+	    XHCI_PS_PP |		/* port power */
+	    XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed);
+
+	if (dev->dev_ue->ue_usbver == 2) {
+		status |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL);
+	} else {
+		status |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) |
+		    XHCI_PS_PED;	/* enabled */
+	}
+
+	port->portsc = status;
+
+	DPRINTF(("Init port %d 0x%x\n", portn, port->portsc));
+}
+
+/*
+ * Interrupt callback handed to emulated USB devices (installed as
+ * hci_intr).
+ *
+ * 'epctx' carries the endpoint number in its low bits and the IN direction
+ * in bit 0x80; it is converted to an endpoint id of (number * 2) plus 1
+ * for IN. Nothing is done until an event ring segment table has been set,
+ * the controller run bit is on and the device has a device context. A port
+ * whose link state reads 3 is moved to the RESUME link state and, unless a
+ * link change is already flagged, a port status change event is queued.
+ * Finally the endpoint's doorbell is rung unless the endpoint is disabled.
+ *
+ * Returns 0 on the early-exit paths and when no event had to be queued,
+ * otherwise the result of pci_xhci_insert_event().
+ */
+static int
+pci_xhci_dev_intr(struct usb_hci *hci, int epctx)
+{
+	struct pci_xhci_dev_emu *dev;
+	struct xhci_dev_ctx *dev_ctx;
+	struct xhci_trb evtrb;
+	struct pci_xhci_softc *sc;
+	struct pci_xhci_portregs *p;
+	struct xhci_endp_ctx *ep_ctx;
+	int error;
+	int dir_in;
+	int epid;
+
+	/*
+	 * 'error' is only assigned on the resume path below; give it a
+	 * defined value so the common path does not return garbage.
+	 */
+	error = 0;
+
+	dir_in = epctx & 0x80;
+	epid = epctx & ~0x80;
+
+	/* HW endpoint contexts are 0-15; convert to epid based on dir */
+	epid = (epid * 2) + (dir_in ? 1 : 0);
+
+	assert(epid >= 1 && epid <= 31);
+
+	dev = hci->hci_sc;
+	sc = dev->xsc;
+
+	/* check if device is ready; OS has to initialise it */
+	if (sc->rtsregs.erstba_p == NULL ||
+	    (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 ||
+	    dev->dev_ctx == NULL)
+		return (0);
+
+	p = XHCI_PORTREG_PTR(sc, hci->hci_port);
+
+	/* raise event if link U3 (suspended) state */
+	if (XHCI_PS_PLS_GET(p->portsc) == 3) {
+		p->portsc &= ~XHCI_PS_PLS_MASK;
+		p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME);
+		if ((p->portsc & XHCI_PS_PLC) != 0)
+			return (0);
+
+		p->portsc |= XHCI_PS_PLC;
+
+		pci_xhci_set_evtrb(&evtrb, hci->hci_port,
+		    XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE);
+		error = pci_xhci_insert_event(sc, &evtrb, 0);
+		if (error != XHCI_TRB_ERROR_SUCCESS)
+			goto done;
+	}
+
+	dev_ctx = dev->dev_ctx;
+	ep_ctx = &dev_ctx->ctx_ep[epid];
+	if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) {
+		DPRINTF(("xhci device interrupt on disabled endpoint %d\r\n",
+		    epid));
+		return (0);
+	}
+
+	DPRINTF(("xhci device interrupt on endpoint %d\r\n", epid));
+
+	pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0);
+
+done:
+	return (error);
+}
+
+/*
+ * Device event callback (installed as hci_event). It only logs the port
+ * number; 'evid' and 'param' are ignored and 0 is always returned.
+ */
+static int
+pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param)
+{
+	(void)evid;
+	(void)param;
+
+	DPRINTF(("xhci device event port %d\r\n", hci->hci_port));
+
+	return (0);
+}
+
+
+
+/*
+ * Report on stderr that 'opt' does not name a usable USB device emulation.
+ */
+static void
+pci_xhci_device_usage(char *opt)
+{
+
+ fprintf(stderr, "Invalid USB emulation \"%s\"\r\n", opt);
+}
+
+/*
+ * Parse the comma-separated "device[=<config>]" option string and create
+ * the emulated USB devices it names.
+ *
+ * USB 2 devices are placed from sc->usb2_port_start upwards and all other
+ * devices from sc->usb3_port_start upwards. Each device is also given the
+ * next free slot. Afterwards the port register array is allocated and,
+ * when at least one device exists, the device, port and slot arrays are
+ * shifted so that they can be indexed from 1 and every port is
+ * initialised.
+ *
+ * Returns the number of devices (reported as 1 when none are configured),
+ * or -1 when an option names an unknown device, a device fails to
+ * initialise, too many devices are given, or memory runs out.
+ */
+static int
+pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts)
+{
+	struct pci_xhci_dev_emu **devices;
+	struct pci_xhci_dev_emu *dev;
+	struct usb_devemu *ue;
+	void *devsc;
+	char *uopt, *xopts, *config;
+	int usb3_port, usb2_port, i;
+
+	usb3_port = sc->usb3_port_start - 1;
+	usb2_port = sc->usb2_port_start - 1;
+	devices = NULL;
+
+	/*
+	 * The private copy of the option string is released exactly once, at
+	 * 'done', on every platform. It must start out NULL because the
+	 * opts == NULL path reaches that free() without ever copying.
+	 */
+	uopt = NULL;
+
+	if (opts == NULL)
+		goto portsfinal;
+
+	devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *));
+	sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *));
+	uopt = strdup(opts);
+	if (devices == NULL || sc->slots == NULL || uopt == NULL) {
+		WPRINTF(("pci_xhci out of memory\r\n"));
+		free(sc->slots);
+		sc->slots = NULL;
+		/* 'devices' may be NULL here, so flag the failure directly */
+		sc->ndevices = -1;
+		usb2_port = usb3_port = -1;
+		goto done;
+	}
+
+	sc->devices = devices;
+	sc->ndevices = 0;
+
+	for (xopts = strtok(uopt, ",");
+	    xopts != NULL;
+	    xopts = strtok(NULL, ",")) {
+		if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) ||
+		    usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) {
+			WPRINTF(("pci_xhci max number of USB 2 or 3 "
+			    "devices reached, max %d\r\n", XHCI_MAX_DEVS/2));
+			usb2_port = usb3_port = -1;
+			goto done;
+		}
+
+		/* device[=<config>] */
+		if ((config = strchr(xopts, '=')) == NULL)
+			config = "";		/* no config */
+		else
+			*config++ = '\0';
+
+		ue = usb_emu_finddev(xopts);
+		if (ue == NULL) {
+			pci_xhci_device_usage(xopts);
+			DPRINTF(("pci_xhci device not found %s\r\n", xopts));
+			usb2_port = usb3_port = -1;
+			goto done;
+		}
+
+		DPRINTF(("pci_xhci adding device %s, opts \"%s\"\r\n",
+		    xopts, config));
+
+		dev = calloc(1, sizeof(struct pci_xhci_dev_emu));
+		if (dev == NULL) {
+			WPRINTF(("pci_xhci out of memory\r\n"));
+			usb2_port = usb3_port = -1;
+			goto done;
+		}
+		dev->xsc = sc;
+		dev->hci.hci_sc = dev;
+		dev->hci.hci_intr = pci_xhci_dev_intr;
+		dev->hci.hci_event = pci_xhci_dev_event;
+
+		/*
+		 * The device is recorded in 'devices' before ue_init() runs, so
+		 * the failure path at 'done' releases it as well.
+		 */
+		if (ue->ue_usbver == 2) {
+			dev->hci.hci_port = usb2_port + 1;
+			devices[usb2_port] = dev;
+			usb2_port++;
+		} else {
+			dev->hci.hci_port = usb3_port + 1;
+			devices[usb3_port] = dev;
+			usb3_port++;
+		}
+
+		dev->hci.hci_address = 0;
+		devsc = ue->ue_init(&dev->hci, config);
+		if (devsc == NULL) {
+			pci_xhci_device_usage(xopts);
+			usb2_port = usb3_port = -1;
+			goto done;
+		}
+
+		dev->dev_ue = ue;
+		dev->dev_sc = devsc;
+
+		/* assign slot number to device */
+		sc->slots[sc->ndevices] = dev;
+
+		sc->ndevices++;
+	}
+
+portsfinal:
+	sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs));
+	if (sc->portregs == NULL) {
+		WPRINTF(("pci_xhci out of memory\r\n"));
+		sc->ndevices = -1;
+		usb2_port = usb3_port = -1;
+		goto done;
+	}
+
+	if (sc->ndevices > 0) {
+		/* port and slot numbering start from 1 */
+		sc->devices--;
+		sc->portregs--;
+		sc->slots--;
+
+		for (i = 1; i <= XHCI_MAX_DEVS; i++) {
+			pci_xhci_init_port(sc, i);
+		}
+	} else {
+		/*
+		 * NOTE(review): on this path portregs is not shifted to be
+		 * 1-based, yet the register handlers index it by 1-based port
+		 * number - confirm the highest port cannot overrun the array.
+		 */
+		WPRINTF(("pci_xhci no USB devices configured\r\n"));
+		sc->ndevices = 1;
+	}
+
+done:
+	if (devices != NULL) {
+		/* both port counters are forced to -1 on every failure path */
+		if (usb2_port <= 0 && usb3_port <= 0) {
+			sc->devices = NULL;
+			/*
+			 * Walk the whole array: USB 3 and USB 2 devices occupy
+			 * separate index ranges, so a NULL entry does not mark
+			 * the end, and a full array has no NULL entry at all.
+			 */
+			for (i = 0; i < XHCI_MAX_DEVS; i++)
+				free(devices[i]);
+			sc->ndevices = -1;
+
+			free(devices);
+		}
+	}
+	free(uopt);
+	return (sc->ndevices);
+}
+
+/*
+ * Device model init entry point (pe_init) for the "xhci" emulation.
+ *
+ * Only one controller may exist. The softc is allocated, the USB devices
+ * named in 'opts' are created, the capability values and register region
+ * offsets are computed, PCI config space is filled in, an MSI capability
+ * and BAR 0 are added, and a legacy interrupt is requested.
+ *
+ * Returns 0 on success and -1 on failure. 'ctx' is not used here.
+ */
+static int
+pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_xhci_softc *sc;
+	int error;
+
+	if (xhci_in_use) {
+		WPRINTF(("pci_xhci controller already defined\r\n"));
+		return (-1);
+	}
+
+	sc = calloc(1, sizeof(struct pci_xhci_softc));
+	if (sc == NULL) {
+		WPRINTF(("pci_xhci out of memory\r\n"));
+		return (-1);
+	}
+
+	/* claim the single controller only once a softc exists */
+	xhci_in_use = 1;
+	pi->pi_arg = sc;
+	sc->xsc_pi = pi;
+
+	sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1;
+	sc->usb3_port_start = 1;
+
+	/* discover devices */
+	error = pci_xhci_parse_opts(sc, opts);
+	if (error < 0)
+		goto done;
+	else
+		error = 0;
+
+	sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) |
+	    XHCI_SET_HCIVERSION(0x0100);
+	sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) |
+	    XHCI_SET_HCSP1_MAXINTR(1) |	/* interrupters */
+	    XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS);
+	sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) |
+	    XHCI_SET_HCSP2_IST(0x04);
+	sc->hcsparams3 = 0;				/* no latency */
+	sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) |	/* no 2nd-streams */
+	    XHCI_SET_HCCP1_SPC(1) |		/* short packet */
+	    XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX);
+	sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) |
+	    XHCI_SET_HCCP2_U3C(1);
+	sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START +
+	    XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs));
+
+	/* dboff must be 32-bit aligned */
+	if (sc->dboff & 0x3)
+		sc->dboff = (sc->dboff + 0x3) & ~0x3;
+
+	/* rtsoff must be 32-bytes aligned */
+	sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32);
+	if (sc->rtsoff & 0x1F)
+		sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F;
+
+	DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x\r\n", sc->dboff,
+	    sc->rtsoff));
+
+	sc->opregs.usbsts = XHCI_STS_HCH;
+	sc->opregs.pgsz = XHCI_PAGESIZE_4K;
+
+	pci_xhci_reset(sc);
+
+	sc->regsend = sc->rtsoff + 0x20 + 32;		/* only 1 intrpter */
+
+	/*
+	 * Set extended capabilities pointer to be after regsend;
+	 * value of xecp field is 32-bit offset.
+	 */
+	sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4);
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB);
+	pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI);
+	pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0);
+
+	pci_emul_add_msicap(pi, 1);
+
+	/* regsend + xecp registers */
+	pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32);
+	DPRINTF(("pci_xhci pci_emu_alloc: %d\r\n", sc->regsend + 4*32));
+
+	pci_lintr_request(pi);
+
+	pthread_mutex_init(&sc->mtx, NULL);
+
+done:
+	if (error) {
+		/*
+		 * Drop the reference to the softc before freeing it and
+		 * release the single-controller claim.
+		 */
+		pi->pi_arg = NULL;
+		xhci_in_use = 0;
+		free(sc);
+	}
+
+	return (error);
+}
+
+
+
+/*
+ * Device emulation descriptor for "xhci": names the init entry point and
+ * the BAR read/write handlers, and is registered with PCI_EMUL_SET().
+ */
+struct pci_devemu pci_de_xhci = {
+ .pe_emu = "xhci",
+ .pe_init = pci_xhci_init,
+ .pe_barwrite = pci_xhci_write,
+ .pe_barread = pci_xhci_read
+};
+PCI_EMUL_SET(pci_de_xhci);
diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h
new file mode 100644
index 0000000000..7502f9396a
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_xhci.h
@@ -0,0 +1,355 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_XHCI_H_
+#define _PCI_XHCI_H_
+
+#define PCI_USBREV 0x60 /* USB protocol revision */
+
+
+enum { /* dsc_slotstate */
+ XHCI_ST_DISABLED,
+ XHCI_ST_ENABLED,
+ XHCI_ST_DEFAULT,
+ XHCI_ST_ADDRESSED,
+ XHCI_ST_CONFIGURED,
+ XHCI_ST_MAX /* last enumerator (value 5); presumably a count rather than a state - TODO confirm */
+};
+
+enum { /* XHCI_ST_SLCTX_*: values 0-3; presumably slot context states - TODO confirm against users */
+ XHCI_ST_SLCTX_DISABLED,
+ XHCI_ST_SLCTX_DEFAULT,
+ XHCI_ST_SLCTX_ADDRESSED,
+ XHCI_ST_SLCTX_CONFIGURED
+};
+
+enum { /* XHCI_ST_EPCTX_*: values 0-4; pci_xhci_dev_intr() compares DISABLED with the low 3 bits of dwEpCtx0 */
+ XHCI_ST_EPCTX_DISABLED,
+ XHCI_ST_EPCTX_RUNNING,
+ XHCI_ST_EPCTX_HALTED,
+ XHCI_ST_EPCTX_STOPPED,
+ XHCI_ST_EPCTX_ERROR
+};
+
+#define XHCI_MAX_DEVICES MIN(USB_MAX_DEVICES, 128) /* never more than 128 */
+#define XHCI_MAX_ENDPOINTS 32 /* hardcoded - do not change */
+#define XHCI_MAX_SCRATCHPADS 32
+#define XHCI_MAX_EVENTS (16 * 13)
+#define XHCI_MAX_COMMANDS (16 * 1)
+#define XHCI_MAX_RSEG 1
+#define XHCI_MAX_TRANSFERS 4
+#if USB_MAX_EP_STREAMS == 8
+#define XHCI_MAX_STREAMS 8
+#define XHCI_MAX_STREAMS_LOG 3 /* log2 of XHCI_MAX_STREAMS */
+#elif USB_MAX_EP_STREAMS == 1
+#define XHCI_MAX_STREAMS 1
+#define XHCI_MAX_STREAMS_LOG 0 /* log2 of XHCI_MAX_STREAMS */
+#else
+#error "The USB_MAX_EP_STREAMS value is not supported."
+#endif
+#define XHCI_DEV_CTX_ADDR_ALIGN 64 /* bytes */
+#define XHCI_DEV_CTX_ALIGN 64 /* bytes */
+#define XHCI_INPUT_CTX_ALIGN 64 /* bytes */
+#define XHCI_SLOT_CTX_ALIGN 32 /* bytes */
+#define XHCI_ENDP_CTX_ALIGN 32 /* bytes */
+#define XHCI_STREAM_CTX_ALIGN 16 /* bytes */
+#define XHCI_TRANS_RING_SEG_ALIGN 16 /* bytes */
+#define XHCI_CMD_RING_SEG_ALIGN 64 /* bytes */
+#define XHCI_EVENT_RING_SEG_ALIGN 64 /* bytes */
+#define XHCI_SCRATCH_BUF_ARRAY_ALIGN 64 /* bytes */
+#define XHCI_SCRATCH_BUFFER_ALIGN USB_PAGE_SIZE /* follows USB_PAGE_SIZE, defined elsewhere */
+#define XHCI_TRB_ALIGN 16 /* bytes */
+#define XHCI_TD_ALIGN 64 /* bytes */
+#define XHCI_PAGE_SIZE 4096 /* bytes */
+
+struct xhci_slot_ctx { /* eight 32-bit words; each XHCI_SCTX_<n>_* macro group is named for the word it follows */
+ volatile uint32_t dwSctx0;
+#define XHCI_SCTX_0_ROUTE_SET(x) ((x) & 0xFFFFF)
+#define XHCI_SCTX_0_ROUTE_GET(x) ((x) & 0xFFFFF)
+#define XHCI_SCTX_0_SPEED_SET(x) (((x) & 0xF) << 20)
+#define XHCI_SCTX_0_SPEED_GET(x) (((x) >> 20) & 0xF)
+#define XHCI_SCTX_0_MTT_SET(x) (((x) & 0x1) << 25)
+#define XHCI_SCTX_0_MTT_GET(x) (((x) >> 25) & 0x1)
+#define XHCI_SCTX_0_HUB_SET(x) (((x) & 0x1) << 26)
+#define XHCI_SCTX_0_HUB_GET(x) (((x) >> 26) & 0x1)
+#define XHCI_SCTX_0_CTX_NUM_SET(x) (((x) & 0x1F) << 27)
+#define XHCI_SCTX_0_CTX_NUM_GET(x) (((x) >> 27) & 0x1F)
+ volatile uint32_t dwSctx1;
+#define XHCI_SCTX_1_MAX_EL_SET(x) ((x) & 0xFFFF)
+#define XHCI_SCTX_1_MAX_EL_GET(x) ((x) & 0xFFFF)
+#define XHCI_SCTX_1_RH_PORT_SET(x) (((x) & 0xFF) << 16)
+#define XHCI_SCTX_1_RH_PORT_GET(x) (((x) >> 16) & 0xFF)
+#define XHCI_SCTX_1_NUM_PORTS_SET(x) (((x) & 0xFF) << 24)
+#define XHCI_SCTX_1_NUM_PORTS_GET(x) (((x) >> 24) & 0xFF)
+ volatile uint32_t dwSctx2;
+#define XHCI_SCTX_2_TT_HUB_SID_SET(x) ((x) & 0xFF)
+#define XHCI_SCTX_2_TT_HUB_SID_GET(x) ((x) & 0xFF)
+#define XHCI_SCTX_2_TT_PORT_NUM_SET(x) (((x) & 0xFF) << 8)
+#define XHCI_SCTX_2_TT_PORT_NUM_GET(x) (((x) >> 8) & 0xFF)
+#define XHCI_SCTX_2_TT_THINK_TIME_SET(x) (((x) & 0x3) << 16)
+#define XHCI_SCTX_2_TT_THINK_TIME_GET(x) (((x) >> 16) & 0x3)
+#define XHCI_SCTX_2_IRQ_TARGET_SET(x) (((x) & 0x3FF) << 22)
+#define XHCI_SCTX_2_IRQ_TARGET_GET(x) (((x) >> 22) & 0x3FF)
+ volatile uint32_t dwSctx3;
+#define XHCI_SCTX_3_DEV_ADDR_SET(x) ((x) & 0xFF)
+#define XHCI_SCTX_3_DEV_ADDR_GET(x) ((x) & 0xFF)
+#define XHCI_SCTX_3_SLOT_STATE_SET(x) (((x) & 0x1F) << 27)
+#define XHCI_SCTX_3_SLOT_STATE_GET(x) (((x) >> 27) & 0x1F)
+ volatile uint32_t dwSctx4; /* dwSctx4-7 have no accessor macros in this header */
+ volatile uint32_t dwSctx5;
+ volatile uint32_t dwSctx6;
+ volatile uint32_t dwSctx7;
+};
+
+struct xhci_endp_ctx { /* two 32-bit words, one 64-bit word, then four 32-bit words */
+ volatile uint32_t dwEpCtx0;
+#define XHCI_EPCTX_0_EPSTATE_SET(x) ((x) & 0x7)
+#define XHCI_EPCTX_0_EPSTATE_GET(x) ((x) & 0x7)
+#define XHCI_EPCTX_0_MULT_SET(x) (((x) & 0x3) << 8)
+#define XHCI_EPCTX_0_MULT_GET(x) (((x) >> 8) & 0x3)
+#define XHCI_EPCTX_0_MAXP_STREAMS_SET(x) (((x) & 0x1F) << 10)
+#define XHCI_EPCTX_0_MAXP_STREAMS_GET(x) (((x) >> 10) & 0x1F)
+#define XHCI_EPCTX_0_LSA_SET(x) (((x) & 0x1) << 15)
+#define XHCI_EPCTX_0_LSA_GET(x) (((x) >> 15) & 0x1)
+#define XHCI_EPCTX_0_IVAL_SET(x) (((x) & 0xFF) << 16)
+#define XHCI_EPCTX_0_IVAL_GET(x) (((x) >> 16) & 0xFF)
+ volatile uint32_t dwEpCtx1;
+#define XHCI_EPCTX_1_CERR_SET(x) (((x) & 0x3) << 1)
+#define XHCI_EPCTX_1_CERR_GET(x) (((x) >> 1) & 0x3)
+#define XHCI_EPCTX_1_EPTYPE_SET(x) (((x) & 0x7) << 3)
+#define XHCI_EPCTX_1_EPTYPE_GET(x) (((x) >> 3) & 0x7)
+#define XHCI_EPCTX_1_HID_SET(x) (((x) & 0x1) << 7)
+#define XHCI_EPCTX_1_HID_GET(x) (((x) >> 7) & 0x1)
+#define XHCI_EPCTX_1_MAXB_SET(x) (((x) & 0xFF) << 8)
+#define XHCI_EPCTX_1_MAXB_GET(x) (((x) >> 8) & 0xFF)
+#define XHCI_EPCTX_1_MAXP_SIZE_SET(x) (((x) & 0xFFFF) << 16)
+#define XHCI_EPCTX_1_MAXP_SIZE_GET(x) (((x) >> 16) & 0xFFFF)
+ volatile uint64_t qwEpCtx2; /* bit 0 is DCS; the TR_DQ_PTR mask below keeps everything above the low 4 bits */
+#define XHCI_EPCTX_2_DCS_SET(x) ((x) & 0x1)
+#define XHCI_EPCTX_2_DCS_GET(x) ((x) & 0x1)
+#define XHCI_EPCTX_2_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U
+ volatile uint32_t dwEpCtx4;
+#define XHCI_EPCTX_4_AVG_TRB_LEN_SET(x) ((x) & 0xFFFF)
+#define XHCI_EPCTX_4_AVG_TRB_LEN_GET(x) ((x) & 0xFFFF)
+#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(x) (((x) & 0xFFFF) << 16)
+#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_GET(x) (((x) >> 16) & 0xFFFF)
+ volatile uint32_t dwEpCtx5; /* dwEpCtx5-7 have no accessor macros in this header */
+ volatile uint32_t dwEpCtx6;
+ volatile uint32_t dwEpCtx7;
+};
+
+struct xhci_input_ctx { /* eight 32-bit words; only words 0 and 1 have mask macros here */
+#define XHCI_INCTX_NON_CTRL_MASK 0xFFFFFFFCU
+ volatile uint32_t dwInCtx0;
+#define XHCI_INCTX_0_DROP_MASK(n) (1U << (n)) /* bit n of dwInCtx0 */
+ volatile uint32_t dwInCtx1;
+#define XHCI_INCTX_1_ADD_MASK(n) (1U << (n)) /* bit n of dwInCtx1 */
+ volatile uint32_t dwInCtx2;
+ volatile uint32_t dwInCtx3;
+ volatile uint32_t dwInCtx4;
+ volatile uint32_t dwInCtx5;
+ volatile uint32_t dwInCtx6;
+ volatile uint32_t dwInCtx7;
+};
+
+struct xhci_input_dev_ctx { /* an input context followed by the slot/endpoint context union */
+ struct xhci_input_ctx ctx_input;
+ union {
+ struct xhci_slot_ctx u_slot; /* shares storage with the start of u_ep */
+ struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS];
+ } ctx_dev_slep;
+};
+
+struct xhci_dev_ctx { /* slot context overlaid on an array of XHCI_MAX_ENDPOINTS endpoint contexts */
+ union {
+ struct xhci_slot_ctx u_slot; /* shares storage with the start of u_ep */
+ struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS];
+ } ctx_dev_slep;
+} __aligned(XHCI_DEV_CTX_ALIGN);
+#define ctx_slot ctx_dev_slep.u_slot /* shorthand member access */
+#define ctx_ep ctx_dev_slep.u_ep /* shorthand member access */
+
+struct xhci_stream_ctx { /* one 64-bit word followed by two 32-bit words */
+ volatile uint64_t qwSctx0; /* bit 0 DCS, bits 1-3 SCT; the TR_DQ_PTR mask keeps everything above the low 4 bits */
+#define XHCI_SCTX_0_DCS_GET(x) ((x) & 0x1)
+#define XHCI_SCTX_0_DCS_SET(x) ((x) & 0x1)
+#define XHCI_SCTX_0_SCT_SET(x) (((x) & 0x7) << 1)
+#define XHCI_SCTX_0_SCT_GET(x) (((x) >> 1) & 0x7)
+#define XHCI_SCTX_0_SCT_SEC_TR_RING 0x0
+#define XHCI_SCTX_0_SCT_PRIM_TR_RING 0x1
+#define XHCI_SCTX_0_SCT_PRIM_SSA_8 0x2
+#define XHCI_SCTX_0_SCT_PRIM_SSA_16 0x3
+#define XHCI_SCTX_0_SCT_PRIM_SSA_32 0x4
+#define XHCI_SCTX_0_SCT_PRIM_SSA_64 0x5
+#define XHCI_SCTX_0_SCT_PRIM_SSA_128 0x6
+#define XHCI_SCTX_0_SCT_PRIM_SSA_256 0x7
+#define XHCI_SCTX_0_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U
+ volatile uint32_t dwSctx2;
+ volatile uint32_t dwSctx3;
+};
+
+struct xhci_trb { /* one 64-bit word followed by two 32-bit words */
+ volatile uint64_t qwTrb0;
+#define XHCI_TRB_0_DIR_IN_MASK (0x80ULL << 0)
+#define XHCI_TRB_0_WLENGTH_MASK (0xFFFFULL << 48)
+ volatile uint32_t dwTrb2; /* several macros below overlap in bit range; presumably the TRB type selects which applies - TODO confirm */
+#define XHCI_TRB_2_ERROR_GET(x) (((x) >> 24) & 0xFF)
+#define XHCI_TRB_2_ERROR_SET(x) (((x) & 0xFF) << 24)
+#define XHCI_TRB_2_TDSZ_GET(x) (((x) >> 17) & 0x1F)
+#define XHCI_TRB_2_TDSZ_SET(x) (((x) & 0x1F) << 17)
+#define XHCI_TRB_2_REM_GET(x) ((x) & 0xFFFFFF)
+#define XHCI_TRB_2_REM_SET(x) ((x) & 0xFFFFFF)
+#define XHCI_TRB_2_BYTES_GET(x) ((x) & 0x1FFFF)
+#define XHCI_TRB_2_BYTES_SET(x) ((x) & 0x1FFFF)
+#define XHCI_TRB_2_IRQ_GET(x) (((x) >> 22) & 0x3FF)
+#define XHCI_TRB_2_IRQ_SET(x) (((x) & 0x3FF) << 22)
+#define XHCI_TRB_2_STREAM_GET(x) (((x) >> 16) & 0xFFFF)
+#define XHCI_TRB_2_STREAM_SET(x) (((x) & 0xFFFF) << 16)
+
+ volatile uint32_t dwTrb3; /* bit 0 is the cycle bit and bits 10-15 hold the TRB type */
+#define XHCI_TRB_3_TYPE_GET(x) (((x) >> 10) & 0x3F)
+#define XHCI_TRB_3_TYPE_SET(x) (((x) & 0x3F) << 10)
+#define XHCI_TRB_3_CYCLE_BIT (1U << 0)
+#define XHCI_TRB_3_TC_BIT (1U << 1) /* command ring only */
+#define XHCI_TRB_3_ENT_BIT (1U << 1) /* transfer ring only */
+#define XHCI_TRB_3_ISP_BIT (1U << 2)
+#define XHCI_TRB_3_ED_BIT (1U << 2)
+#define XHCI_TRB_3_NSNOOP_BIT (1U << 3)
+#define XHCI_TRB_3_CHAIN_BIT (1U << 4)
+#define XHCI_TRB_3_IOC_BIT (1U << 5)
+#define XHCI_TRB_3_IDT_BIT (1U << 6)
+#define XHCI_TRB_3_TBC_GET(x) (((x) >> 7) & 3)
+#define XHCI_TRB_3_TBC_SET(x) (((x) & 3) << 7)
+#define XHCI_TRB_3_BEI_BIT (1U << 9)
+#define XHCI_TRB_3_DCEP_BIT (1U << 9)
+#define XHCI_TRB_3_PRSV_BIT (1U << 9)
+#define XHCI_TRB_3_BSR_BIT (1U << 9)
+#define XHCI_TRB_3_TRT_MASK (3U << 16)
+#define XHCI_TRB_3_TRT_NONE (0U << 16)
+#define XHCI_TRB_3_TRT_OUT (2U << 16)
+#define XHCI_TRB_3_TRT_IN (3U << 16)
+#define XHCI_TRB_3_DIR_IN (1U << 16)
+#define XHCI_TRB_3_TLBPC_GET(x) (((x) >> 16) & 0xF)
+#define XHCI_TRB_3_TLBPC_SET(x) (((x) & 0xF) << 16)
+#define XHCI_TRB_3_EP_GET(x) (((x) >> 16) & 0x1F)
+#define XHCI_TRB_3_EP_SET(x) (((x) & 0x1F) << 16)
+#define XHCI_TRB_3_FRID_GET(x) (((x) >> 20) & 0x7FF)
+#define XHCI_TRB_3_FRID_SET(x) (((x) & 0x7FF) << 20)
+#define XHCI_TRB_3_ISO_SIA_BIT (1U << 31)
+#define XHCI_TRB_3_SUSP_EP_BIT (1U << 23)
+#define XHCI_TRB_3_SLOT_GET(x) (((x) >> 24) & 0xFF)
+#define XHCI_TRB_3_SLOT_SET(x) (((x) & 0xFF) << 24)
+
+/* Commands */
+#define XHCI_TRB_TYPE_RESERVED 0x00
+#define XHCI_TRB_TYPE_NORMAL 0x01
+#define XHCI_TRB_TYPE_SETUP_STAGE 0x02
+#define XHCI_TRB_TYPE_DATA_STAGE 0x03
+#define XHCI_TRB_TYPE_STATUS_STAGE 0x04
+#define XHCI_TRB_TYPE_ISOCH 0x05
+#define XHCI_TRB_TYPE_LINK 0x06
+#define XHCI_TRB_TYPE_EVENT_DATA 0x07
+#define XHCI_TRB_TYPE_NOOP 0x08
+#define XHCI_TRB_TYPE_ENABLE_SLOT 0x09
+#define XHCI_TRB_TYPE_DISABLE_SLOT 0x0A
+#define XHCI_TRB_TYPE_ADDRESS_DEVICE 0x0B
+#define XHCI_TRB_TYPE_CONFIGURE_EP 0x0C
+#define XHCI_TRB_TYPE_EVALUATE_CTX 0x0D
+#define XHCI_TRB_TYPE_RESET_EP 0x0E
+#define XHCI_TRB_TYPE_STOP_EP 0x0F
+#define XHCI_TRB_TYPE_SET_TR_DEQUEUE 0x10
+#define XHCI_TRB_TYPE_RESET_DEVICE 0x11
+#define XHCI_TRB_TYPE_FORCE_EVENT 0x12
+#define XHCI_TRB_TYPE_NEGOTIATE_BW 0x13
+#define XHCI_TRB_TYPE_SET_LATENCY_TOL 0x14
+#define XHCI_TRB_TYPE_GET_PORT_BW 0x15
+#define XHCI_TRB_TYPE_FORCE_HEADER 0x16
+#define XHCI_TRB_TYPE_NOOP_CMD 0x17
+
+/* Events */
+#define XHCI_TRB_EVENT_TRANSFER 0x20
+#define XHCI_TRB_EVENT_CMD_COMPLETE 0x21
+#define XHCI_TRB_EVENT_PORT_STS_CHANGE 0x22
+#define XHCI_TRB_EVENT_BW_REQUEST 0x23
+#define XHCI_TRB_EVENT_DOORBELL 0x24
+#define XHCI_TRB_EVENT_HOST_CTRL 0x25
+#define XHCI_TRB_EVENT_DEVICE_NOTIFY 0x26
+#define XHCI_TRB_EVENT_MFINDEX_WRAP 0x27
+
+/* Error codes */
+#define XHCI_TRB_ERROR_INVALID 0x00
+#define XHCI_TRB_ERROR_SUCCESS 0x01 /* the value the xhci code compares results against for success */
+#define XHCI_TRB_ERROR_DATA_BUF 0x02
+#define XHCI_TRB_ERROR_BABBLE 0x03
+#define XHCI_TRB_ERROR_XACT 0x04
+#define XHCI_TRB_ERROR_TRB 0x05
+#define XHCI_TRB_ERROR_STALL 0x06
+#define XHCI_TRB_ERROR_RESOURCE 0x07
+#define XHCI_TRB_ERROR_BANDWIDTH 0x08
+#define XHCI_TRB_ERROR_NO_SLOTS 0x09
+#define XHCI_TRB_ERROR_STREAM_TYPE 0x0A
+#define XHCI_TRB_ERROR_SLOT_NOT_ON 0x0B
+#define XHCI_TRB_ERROR_ENDP_NOT_ON 0x0C
+#define XHCI_TRB_ERROR_SHORT_PKT 0x0D
+#define XHCI_TRB_ERROR_RING_UNDERRUN 0x0E
+#define XHCI_TRB_ERROR_RING_OVERRUN 0x0F
+#define XHCI_TRB_ERROR_VF_RING_FULL 0x10
+#define XHCI_TRB_ERROR_PARAMETER 0x11
+#define XHCI_TRB_ERROR_BW_OVERRUN 0x12
+#define XHCI_TRB_ERROR_CONTEXT_STATE 0x13
+#define XHCI_TRB_ERROR_NO_PING_RESP 0x14
+#define XHCI_TRB_ERROR_EV_RING_FULL 0x15
+#define XHCI_TRB_ERROR_INCOMPAT_DEV 0x16
+#define XHCI_TRB_ERROR_MISSED_SERVICE 0x17
+#define XHCI_TRB_ERROR_CMD_RING_STOP 0x18
+#define XHCI_TRB_ERROR_CMD_ABORTED 0x19
+#define XHCI_TRB_ERROR_STOPPED 0x1A
+#define XHCI_TRB_ERROR_LENGTH 0x1B
+#define XHCI_TRB_ERROR_BAD_MELAT 0x1D
+#define XHCI_TRB_ERROR_ISOC_OVERRUN 0x1F
+#define XHCI_TRB_ERROR_EVENT_LOST 0x20
+#define XHCI_TRB_ERROR_UNDEFINED 0x21
+#define XHCI_TRB_ERROR_INVALID_SID 0x22
+#define XHCI_TRB_ERROR_SEC_BW 0x23
+#define XHCI_TRB_ERROR_SPLIT_XACT 0x24
+} __aligned(4);
+
+struct xhci_dev_endpoint_trbs { /* XHCI_MAX_TRANSFERS + 1 TRBs for each of XHCI_MAX_STREAMS streams */
+ struct xhci_trb trb[(XHCI_MAX_STREAMS *
+ XHCI_MAX_TRANSFERS) + XHCI_MAX_STREAMS];
+};
+
+struct xhci_event_ring_seg { /* one event ring segment table entry */
+ volatile uint64_t qwEvrsTablePtr; /* guest address of the segment; translated with XHCI_GADDR by the runtime register code */
+ volatile uint32_t dwEvrsTableSize; /* used as a TRB count when the dequeue pointer wraps */
+ volatile uint32_t dwEvrsReserved;
+};
+
+#endif /* _PCI_XHCI_H_ */
diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c
new file mode 100644
index 0000000000..be188b79f2
--- /dev/null
+++ b/usr/src/cmd/bhyve/pm.c
@@ -0,0 +1,378 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Hudson River Trading LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <stdlib.h>
+#endif
+#include <signal.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+/*
+ * pm_lock is taken by the PM1 status/enable port handlers, the SMI command
+ * handler and the power button handler in this file.
+ */
+static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
+#ifdef __FreeBSD__
+/* SIGTERM mevent registered while ACPI is enabled (see smi_cmd_handler). */
+static struct mevent *power_button;
+/* SIGTERM disposition that was in place before ACPI was enabled. */
+static sig_t old_power_handler;
+#else
+/* VM context saved by sci_init() for use by the SIGTERM signal handler. */
+struct vmctx *pwr_ctx;
+#endif
+
+/*
+ * Reset Control register at I/O port 0xcf9.
+ *
+ * Only single-byte accesses are accepted; any other width returns -1.
+ * Reads return the last value written.  A write latches the value and,
+ * whenever bit 2 is set in it, asks the hypervisor to suspend the VM for a
+ * reset.  Bit 1 (soft vs. hard reset) is stored but otherwise ignored.
+ */
+static int
+reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	static uint8_t reset_control;
+	int error;
+
+	if (bytes != 1)
+		return (-1);
+
+	if (in) {
+		*eax = reset_control;
+		return (0);
+	}
+
+	reset_control = *eax;
+	if ((reset_control & 0x4) == 0)
+		return (0);
+
+	/* Treat hard and soft resets the same. */
+	error = vm_suspend(ctx, VM_SUSPEND_RESET);
+	/* A suspend that is already in progress is not an error. */
+	assert(error == 0 || errno == EALREADY);
+	return (0);
+}
+INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
+
+/*
+ * ACPI's SCI is a level-triggered interrupt.
+ */
+static int sci_active;
+
+/*
+ * Raise the SCI line unless sci_active says it is already raised.
+ */
+static void
+sci_assert(struct vmctx *ctx)
+{
+	if (!sci_active) {
+		vm_isa_assert_irq(ctx, SCI_INT, SCI_INT);
+		sci_active = 1;
+	}
+}
+
+/*
+ * Lower the SCI line unless sci_active says it is already lowered.
+ */
+static void
+sci_deassert(struct vmctx *ctx)
+{
+	if (sci_active) {
+		vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT);
+		sci_active = 0;
+	}
+}
+
+/*
+ * Power Management 1 Event Registers
+ *
+ * The only power management event supported is a power button upon
+ * receiving SIGTERM.
+ */
+static uint16_t pm1_enable, pm1_status;
+
+#define PM1_TMR_STS 0x0001
+#define PM1_BM_STS 0x0010
+#define PM1_GBL_STS 0x0020
+#define PM1_PWRBTN_STS 0x0100
+#define PM1_SLPBTN_STS 0x0200
+#define PM1_RTC_STS 0x0400
+#define PM1_WAK_STS 0x8000
+
+#define PM1_TMR_EN 0x0001
+#define PM1_GBL_EN 0x0020
+#define PM1_PWRBTN_EN 0x0100
+#define PM1_SLPBTN_EN 0x0200
+#define PM1_RTC_EN 0x0400
+
+/*
+ * Recompute the SCI level from the PM1 event registers: the interrupt is
+ * asserted when at least one event is both enabled in pm1_enable and
+ * pending in pm1_status, and deasserted otherwise.
+ */
+static void
+sci_update(struct vmctx *ctx)
+{
+	static const struct {
+		uint16_t en;
+		uint16_t sts;
+	} events[] = {
+		{ PM1_TMR_EN,		PM1_TMR_STS },
+		{ PM1_GBL_EN,		PM1_GBL_STS },
+		{ PM1_PWRBTN_EN,	PM1_PWRBTN_STS },
+		{ PM1_SLPBTN_EN,	PM1_SLPBTN_STS },
+		{ PM1_RTC_EN,		PM1_RTC_STS },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof (events) / sizeof (events[0]); i++) {
+		if ((pm1_enable & events[i].en) &&
+		    (pm1_status & events[i].sts)) {
+			sci_assert(ctx);
+			return;
+		}
+	}
+	sci_deassert(ctx);
+}
+
+/*
+ * I/O handler for the PM1 status register.  Only 16-bit accesses are
+ * accepted.  Reads return pm1_status; writes clear the write-one-to-clear
+ * bits selected by the written value and then re-evaluate the SCI.
+ */
+static int
+pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	/* Status bits the guest may clear by writing 1 to them. */
+	const uint16_t w1c = PM1_WAK_STS | PM1_RTC_STS | PM1_SLPBTN_STS |
+	    PM1_PWRBTN_STS | PM1_BM_STS;
+
+	if (bytes != 2)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	if (!in) {
+		pm1_status &= ~(*eax & w1c);
+		sci_update(ctx);
+	} else {
+		*eax = pm1_status;
+	}
+	pthread_mutex_unlock(&pm_lock);
+
+	return (0);
+}
+
+/*
+ * I/O handler for the PM1 enable register.  Only 16-bit accesses are
+ * accepted.  Reads return pm1_enable; writes replace it with the written
+ * value restricted to the supported enable bits and re-evaluate the SCI.
+ */
+static int
+pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	/*
+	 * Only these bits may be set.  The global lock is never used, but
+	 * ACPI-CA whines profusely if it can't set GBL_EN.
+	 */
+	const uint16_t settable = PM1_PWRBTN_EN | PM1_GBL_EN;
+
+	if (bytes != 2)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	if (!in) {
+		pm1_enable = *eax & settable;
+		sci_update(ctx);
+	} else {
+		*eax = pm1_enable;
+	}
+	pthread_mutex_unlock(&pm_lock);
+
+	return (0);
+}
+INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
+INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
+
+#ifdef __FreeBSD__
+/*
+ * mevent callback for SIGTERM.  Latches PM1_PWRBTN_STS in pm1_status if it
+ * is not already set and re-evaluates the SCI, all under pm_lock.  'arg' is
+ * the vmctx that was passed when the event was registered.
+ */
+static void
+power_button_handler(int signal, enum ev_type type, void *arg)
+{
+ struct vmctx *ctx;
+
+ ctx = arg;
+ pthread_mutex_lock(&pm_lock);
+ if (!(pm1_status & PM1_PWRBTN_STS)) {
+ pm1_status |= PM1_PWRBTN_STS;
+ sci_update(ctx);
+ }
+ pthread_mutex_unlock(&pm_lock);
+}
+
+#else
+/*
+ * Initiate graceful power off.
+ *
+ * SIGTERM signal handler installed by sci_init().  Latches PM1_PWRBTN_STS
+ * in pm1_status if it is not already set and re-evaluates the SCI using the
+ * context saved in pwr_ctx.  All three arguments are unused.
+ */
+/*ARGSUSED*/
+static void
+power_button_handler(int signal, siginfo_t *type, void *cp)
+{
+ /*
+ * Taking the 'pm_lock' mutex from within this signal handler can lead
+ * to deadlock if the interrupted thread already holds this mutex.
+ * The mutex is local to this file and is taken here for consistency
+ * with the other code in this file.
+ *
+ * NOTE(review): an earlier version of this comment stated that every
+ * other user of pm_lock is FreeBSD specific.  That does not match this
+ * file: pm1_status_handler(), pm1_enable_handler() and
+ * smi_cmd_handler() take pm_lock outside of any #ifdef __FreeBSD__.
+ * If SIGTERM can be delivered to a thread running one of those
+ * handlers, this lock acquisition needs to be revisited (for example
+ * by handling the signal on a dedicated thread).
+ */
+ pthread_mutex_lock(&pm_lock);
+ if (!(pm1_status & PM1_PWRBTN_STS)) {
+ pm1_status |= PM1_PWRBTN_STS;
+ sci_update(pwr_ctx);
+ }
+ pthread_mutex_unlock(&pm_lock);
+}
+#endif
+
+/*
+ * Power Management 1 Control Register
+ *
+ * This is mostly unimplemented except that we wish to handle writes that
+ * set SLP_EN to handle S5 (soft power off).
+ */
+static uint16_t pm1_control;
+
+#define PM1_SCI_EN 0x0001
+#define PM1_SLP_TYP 0x1c00
+#define PM1_SLP_EN 0x2000
+#define PM1_ALWAYS_ZERO 0xc003
+
+/*
+ * I/O handler for the PM1 control register.  Only 16-bit accesses are
+ * accepted.  Reads return pm1_control.  Writes store the value with the
+ * write-only/reserved bits forced to zero and SCI_EN preserved, and a write
+ * with SLP_EN set and SLP_TYP == 5 requests a VM power-off.
+ */
+static int
+pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	int error;
+
+	if (bytes != 2)
+		return (-1);
+
+	if (in) {
+		*eax = pm1_control;
+		return (0);
+	}
+
+	/*
+	 * Various bits are write-only or reserved, so force them to zero
+	 * in pm1_control.  Always preserve SCI_EN as OSPM can never change
+	 * it.
+	 */
+	pm1_control = (pm1_control & PM1_SCI_EN) |
+	    (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
+
+	/* Nothing more to do unless the guest is entering a sleep state. */
+	if ((*eax & PM1_SLP_EN) == 0)
+		return (0);
+
+	/*
+	 * Bhyve's _S5_ method says that '5' should be stored in SLP_TYP
+	 * for S5; any other sleep type is ignored.
+	 */
+	if (((pm1_control & PM1_SLP_TYP) >> 10) != 5)
+		return (0);
+
+	error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
+	assert(error == 0 || errno == EALREADY);
+	return (0);
+}
+INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
+#ifdef __FreeBSD__
+SYSRES_IO(PM1A_EVT_ADDR, 8);
+#endif
+
+/*
+ * ACPI SMI Command Register
+ *
+ * This write-only register is used to enable and disable ACPI.
+ */
+/*
+ * I/O handler for the ACPI SMI command port.  Reads are never expected
+ * (asserted) and only single-byte writes are accepted.
+ * BHYVE_ACPI_ENABLE sets SCI_EN in pm1_control and BHYVE_ACPI_DISABLE
+ * clears it; any other value is ignored.  On FreeBSD the SIGTERM power
+ * button event is registered on enable and removed on disable.
+ */
+static int
+smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	uint32_t cmd;
+
+	assert(!in);
+	if (bytes != 1)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	cmd = *eax;
+	if (cmd == BHYVE_ACPI_ENABLE) {
+		pm1_control |= PM1_SCI_EN;
+#ifdef __FreeBSD__
+		if (power_button == NULL) {
+			power_button = mevent_add(SIGTERM, EVF_SIGNAL,
+			    power_button_handler, ctx);
+			old_power_handler = signal(SIGTERM, SIG_IGN);
+		}
+#endif
+	} else if (cmd == BHYVE_ACPI_DISABLE) {
+		pm1_control &= ~PM1_SCI_EN;
+#ifdef __FreeBSD__
+		if (power_button != NULL) {
+			mevent_delete(power_button);
+			power_button = NULL;
+			signal(SIGTERM, old_power_handler);
+		}
+#endif
+	}
+	pthread_mutex_unlock(&pm_lock);
+
+	return (0);
+}
+INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
+#ifdef __FreeBSD__
+SYSRES_IO(SMI_CMD, 1);
+#endif
+
+/*
+ * One-time SCI setup: bump the SCI's use count in the PIRQ router and mark
+ * it level triggered.  On non-FreeBSD builds this also records the VM
+ * context in pwr_ctx and installs power_button_handler() as the SIGTERM
+ * handler so that SIGTERM initiates a graceful power off.
+ */
+void
+sci_init(struct vmctx *ctx)
+{
+
+	/*
+	 * Mark ACPI's SCI as level trigger and bump its use count
+	 * in the PIRQ router.
+	 */
+	pci_irq_use(SCI_INT);
+	vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER);
+
+#ifndef __FreeBSD__
+	{
+		/*
+		 * Install SIGTERM signal handler for graceful power off.
+		 */
+		struct sigaction act;
+
+		pwr_ctx = ctx;
+
+		/*
+		 * Every field sigaction() reads must be initialized:
+		 * sa_mask would otherwise hold stack garbage and block an
+		 * arbitrary set of signals while the handler runs.
+		 */
+		(void) sigemptyset(&act.sa_mask);
+		/*
+		 * The handler has the three-argument sa_sigaction
+		 * signature, so SA_SIGINFO is required for it to be
+		 * invoked with matching arguments.
+		 */
+		act.sa_flags = SA_SIGINFO;
+		act.sa_sigaction = power_button_handler;
+		(void) sigaction(SIGTERM, &act, NULL);
+	}
+#endif
+}
diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c
new file mode 100644
index 0000000000..d3040a8df7
--- /dev/null
+++ b/usr/src/cmd/bhyve/post.c
@@ -0,0 +1,55 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+#include "pci_lpc.h"
+
+/*
+ * Read handler for the POST data port.  Writes are never expected
+ * (asserted).  A single-byte read returns 0xff; any other access width is
+ * rejected with -1.
+ */
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	assert(in == 1);
+
+	if (bytes == 1) {
+		*eax = 0xff;		/* return some garbage */
+		return (0);
+	}
+
+	return (-1);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
+SYSRES_IO(0x84, 1);
diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c
new file mode 100644
index 0000000000..5453a26949
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2kbd.c
@@ -0,0 +1,383 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "atkbdc.h"
+#include "console.h"
+
+/* keyboard device commands */
+#define PS2KC_RESET_DEV 0xff
+#define PS2KC_DISABLE 0xf5
+#define PS2KC_ENABLE 0xf4
+#define PS2KC_SET_TYPEMATIC 0xf3
+#define PS2KC_SEND_DEV_ID 0xf2
+#define PS2KC_SET_SCANCODE_SET 0xf0
+#define PS2KC_ECHO 0xee
+#define PS2KC_SET_LEDS 0xed
+
+#define PS2KC_BAT_SUCCESS 0xaa
+#define PS2KC_ACK 0xfa
+
+#define PS2KBD_FIFOSZ 16
+
+/*
+ * Fixed-capacity byte ring buffer.  fifo_put() stores at windex and
+ * fifo_get() loads from rindex; both indices wrap modulo 'size'.
+ */
+struct fifo {
+ uint8_t buf[PS2KBD_FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of bytes in the fifo */
+ int size; /* size of the fifo */
+};
+
+/*
+ * Per-keyboard state.  'mtx' is held by ps2kbd_read(), ps2kbd_write() and
+ * ps2kbd_event() while they touch the fields below.
+ */
+struct ps2kbd_softc {
+ struct atkbdc_softc *atkbdc_sc; /* controller notified of new data */
+ pthread_mutex_t mtx;
+
+ bool enabled; /* key events are queued only when true */
+ struct fifo fifo; /* bytes waiting to be read by ps2kbd_read() */
+
+ uint8_t curcmd; /* current command for next byte */
+};
+
+/* Flag for extended_translation.flags: emit 0xe0 before the scancode. */
+#define SCANCODE_E0_PREFIX 1
+/* One keysym-to-scancode mapping used for keysyms >= 0x80. */
+struct extended_translation {
+ uint32_t keysym; /* keysym to match; 0 terminates the table */
+ uint8_t scancode; /* type 2 scancode queued for this keysym */
+ int flags; /* 0 or SCANCODE_E0_PREFIX */
+};
+
+/*
+ * Keysym to type 2 scancode table, searched linearly by
+ * ps2kbd_keysym_queue() for keysyms >= 0x80.  Entries that omit the third
+ * initializer have flags == 0 (no 0xe0 prefix).  The table ends at the
+ * entry whose keysym is 0.
+ * NOTE(review): the 0xffxx/0xfexx values look like X11 keysyms -- confirm
+ * against the console/RFB code that produces them.
+ *
+ * FIXME: Pause/break and Print Screen/SysRq require special handling.
+ */
+static const struct extended_translation extended_translations[] = {
+ {0xff08, 0x66}, /* Back space */
+ {0xff09, 0x0d}, /* Tab */
+ {0xff0d, 0x5a}, /* Return */
+ {0xff1b, 0x76}, /* Escape */
+ {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */
+ {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */
+ {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */
+ {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */
+ {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */
+ {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */
+ {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */
+ {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */
+ {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */
+ {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */
+ {0xffe1, 0x12}, /* Left shift */
+ {0xffe2, 0x59}, /* Right shift */
+ {0xffe3, 0x14}, /* Left control */
+ {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */
+ /* {0xffe7, XXX}, Left meta */
+ /* {0xffe8, XXX}, Right meta */
+ {0xffe9, 0x11}, /* Left alt */
+ {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */
+ {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */
+ {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */
+ {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */
+ {0xffbe, 0x05}, /* F1 */
+ {0xffbf, 0x06}, /* F2 */
+ {0xffc0, 0x04}, /* F3 */
+ {0xffc1, 0x0c}, /* F4 */
+ {0xffc2, 0x03}, /* F5 */
+ {0xffc3, 0x0b}, /* F6 */
+ {0xffc4, 0x83}, /* F7 */
+ {0xffc5, 0x0a}, /* F8 */
+ {0xffc6, 0x01}, /* F9 */
+ {0xffc7, 0x09}, /* F10 */
+ {0xffc8, 0x78}, /* F11 */
+ {0xffc9, 0x07}, /* F12 */
+ {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */
+ {0xff14, 0x7e}, /* ScrollLock */
+ /* NumLock and Keypads*/
+ {0xff7f, 0x77}, /* NumLock */
+ {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */
+ {0xffaa, 0x7c}, /* Keypad asterisk */
+ {0xffad, 0x7b}, /* Keypad minus */
+ {0xffab, 0x79}, /* Keypad plus */
+ {0xffb7, 0x6c}, /* Keypad 7 */
+ {0xff95, 0x6c}, /* Keypad home */
+ {0xffb8, 0x75}, /* Keypad 8 */
+ {0xff97, 0x75}, /* Keypad up arrow */
+ {0xffb9, 0x7d}, /* Keypad 9 */
+ {0xff9a, 0x7d}, /* Keypad PgUp */
+ {0xffb4, 0x6b}, /* Keypad 4 */
+ {0xff96, 0x6b}, /* Keypad left arrow */
+ {0xffb5, 0x73}, /* Keypad 5 */
+ {0xff9d, 0x73}, /* Keypad empty */
+ {0xffb6, 0x74}, /* Keypad 6 */
+ {0xff98, 0x74}, /* Keypad right arrow */
+ {0xffb1, 0x69}, /* Keypad 1 */
+ {0xff9c, 0x69}, /* Keypad end */
+ {0xffb2, 0x72}, /* Keypad 2 */
+ {0xff99, 0x72}, /* Keypad down arrow */
+ {0xffb3, 0x7a}, /* Keypad 3 */
+ {0xff9b, 0x7a}, /* Keypad PgDown */
+ {0xffb0, 0x70}, /* Keypad 0 */
+ {0xff9e, 0x70}, /* Keypad ins */
+ {0xffae, 0x71}, /* Keypad . */
+ {0xff9f, 0x71}, /* Keypad del */
+ {0, 0, 0} /* Terminator */
+};
+
+/*
+ * ASCII to type 2 scancode lookup table, indexed directly by keysyms below
+ * 0x80 (eight entries per row, so each row covers 0x08 code points).
+ * Shifted and unshifted characters on the same key share a scancode, e.g.
+ * 'A' (0x41) and 'a' (0x61) both map to 0x1c.  Entries 0x00-0x1f and 0x7f
+ * are 0x00.
+ */
+static const uint8_t ascii_translations[128] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52,
+ 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a,
+ 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d,
+ 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a,
+ 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34,
+ 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44,
+ 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d,
+ 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e,
+ 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34,
+ 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44,
+ 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d,
+ 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00,
+};
+
+/*
+ * Record the FIFO capacity (the byte size of its buffer).  Leaves the
+ * indices and count untouched.
+ */
+static void
+fifo_init(struct ps2kbd_softc *sc)
+{
+	sc->fifo.size = sizeof (sc->fifo.buf);
+}
+
+/*
+ * Discard everything queued: zero the whole FIFO and then restore its
+ * capacity.
+ */
+static void
+fifo_reset(struct ps2kbd_softc *sc)
+{
+	bzero(&sc->fifo, sizeof (sc->fifo));
+	sc->fifo.size = sizeof (sc->fifo.buf);
+}
+
+/*
+ * Append one byte to the FIFO.  The byte is silently dropped when the FIFO
+ * is already full.
+ */
+static void
+fifo_put(struct ps2kbd_softc *sc, uint8_t val)
+{
+	struct fifo *q = &sc->fifo;
+
+	if (q->num >= q->size)
+		return;
+
+	q->buf[q->windex] = val;
+	q->windex = (q->windex + 1) % q->size;
+	q->num++;
+}
+
+/*
+ * Remove the oldest byte from the FIFO into *val.  Returns 0 on success or
+ * -1 (leaving *val untouched) when the FIFO is empty.
+ */
+static int
+fifo_get(struct ps2kbd_softc *sc, uint8_t *val)
+{
+	struct fifo *q = &sc->fifo;
+
+	if (q->num <= 0)
+		return (-1);
+
+	*val = q->buf[q->rindex];
+	q->rindex = (q->rindex + 1) % q->size;
+	q->num--;
+	return (0);
+}
+
+/*
+ * Fetch the next byte queued by the keyboard into *val while holding the
+ * softc mutex.  Returns 0 on success, -1 if nothing is queued.
+ */
+int
+ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val)
+{
+	int rc;
+
+	pthread_mutex_lock(&sc->mtx);
+	rc = fifo_get(sc, val);
+	pthread_mutex_unlock(&sc->mtx);
+	return (rc);
+}
+
+/*
+ * Deliver one byte written by the guest to the keyboard.
+ *
+ * If a two-byte command is pending (sc->curcmd), 'val' is its data byte:
+ * it is acknowledged and the pending command is cleared.  Otherwise 'val'
+ * is a command byte; the response bytes are queued in the FIFO, and
+ * commands that take a data byte are remembered in sc->curcmd.
+ */
+void
+ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val)
+{
+	uint8_t pending;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	pending = sc->curcmd;
+	if (pending != 0) {
+		/* 'val' is the argument of the previous command. */
+		sc->curcmd = 0;
+		switch (pending) {
+		case PS2KC_SET_TYPEMATIC:
+		case PS2KC_SET_SCANCODE_SET:
+		case PS2KC_SET_LEDS:
+			/* The argument itself is ignored, just ACKed. */
+			fifo_put(sc, PS2KC_ACK);
+			break;
+		default:
+			fprintf(stderr, "Unhandled ps2 keyboard current "
+			    "command byte 0x%02x\n", val);
+			break;
+		}
+		pthread_mutex_unlock(&sc->mtx);
+		return;
+	}
+
+	switch (val) {
+	case PS2KC_SET_TYPEMATIC:
+	case PS2KC_SET_SCANCODE_SET:
+	case PS2KC_SET_LEDS:
+		/* Two-byte commands: remember it for the data byte. */
+		sc->curcmd = val;
+		/* FALLTHROUGH */
+	case 0x00:
+		fifo_put(sc, PS2KC_ACK);
+		break;
+	case PS2KC_RESET_DEV:
+		fifo_reset(sc);
+		fifo_put(sc, PS2KC_ACK);
+		fifo_put(sc, PS2KC_BAT_SUCCESS);
+		break;
+	case PS2KC_DISABLE:
+		sc->enabled = false;
+		fifo_put(sc, PS2KC_ACK);
+		break;
+	case PS2KC_ENABLE:
+		sc->enabled = true;
+		fifo_reset(sc);
+		fifo_put(sc, PS2KC_ACK);
+		break;
+	case PS2KC_SEND_DEV_ID:
+		fifo_put(sc, PS2KC_ACK);
+		fifo_put(sc, 0xab);
+		fifo_put(sc, 0x83);
+		break;
+	case PS2KC_ECHO:
+		fifo_put(sc, PS2KC_ECHO);
+		break;
+	default:
+		fprintf(stderr, "Unhandled ps2 keyboard command "
+		    "0x%02x\n", val);
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Translate keysym to type 2 scancode and insert into keyboard buffer.
+ *
+ * Keysyms below 0x80 are looked up in ascii_translations; all others are
+ * searched for in extended_translations.  An unknown keysym is reported on
+ * stderr and ignored.  The queued sequence is an optional 0xe0 prefix, an
+ * optional 0xf0 break prefix (when 'down' is zero) and the scancode.
+ *
+ * The sequence is queued all-or-nothing: if the FIFO cannot hold every
+ * byte of it, the whole key event is dropped.  Queuing only the leading
+ * prefix byte(s) would leave a truncated sequence in the stream and make
+ * the guest misinterpret the next scancode.
+ *
+ * The caller must hold sc->mtx.
+ */
+static void
+ps2kbd_keysym_queue(struct ps2kbd_softc *sc,
+    int down, uint32_t keysym)
+{
+	const struct extended_translation *trans;
+	int e0_prefix, found, needed;
+	uint8_t code;
+
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	found = 0;
+	code = 0;
+	e0_prefix = 0;
+	if (keysym < 0x80) {
+		code = ascii_translations[keysym];
+		found = 1;
+	} else {
+		for (trans = &(extended_translations[0]); trans->keysym != 0;
+		    trans++) {
+			if (keysym == trans->keysym) {
+				code = trans->scancode;
+				e0_prefix = trans->flags & SCANCODE_E0_PREFIX;
+				found = 1;
+				break;
+			}
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", keysym);
+		return;
+	}
+
+	/* Scancode itself, plus one byte for each prefix that applies. */
+	needed = 1 + (e0_prefix ? 1 : 0) + (down ? 0 : 1);
+	if (sc->fifo.size - sc->fifo.num < needed)
+		return;
+
+	if (e0_prefix)
+		fifo_put(sc, 0xe0);
+	if (!down)
+		fifo_put(sc, 0xf0);
+	fifo_put(sc, code);
+}
+
+/*
+ * Console keyboard callback ('arg' is the ps2kbd_softc).  While the
+ * keyboard is enabled, queue the scancode(s) for the key event and, unless
+ * the FIFO was already completely full beforehand, notify the keyboard
+ * controller.  Events are ignored while the keyboard is disabled.
+ */
+static void
+ps2kbd_event(int down, uint32_t keysym, void *arg)
+{
+	struct ps2kbd_softc *sc = arg;
+	int had_room;
+
+	pthread_mutex_lock(&sc->mtx);
+	if (sc->enabled) {
+		/* Sample fullness before queuing, under the lock. */
+		had_room = (sc->fifo.num != PS2KBD_FIFOSZ);
+		ps2kbd_keysym_queue(sc, down, keysym);
+		pthread_mutex_unlock(&sc->mtx);
+
+		if (had_room)
+			atkbdc_event(sc->atkbdc_sc, 1);
+		return;
+	}
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Allocate and initialize the keyboard state for the given controller and
+ * register ps2kbd_event() as a console keyboard handler.
+ *
+ * Returns the new softc, or NULL if the allocation fails (previously a
+ * failed allocation was dereferenced immediately).
+ */
+struct ps2kbd_softc *
+ps2kbd_init(struct atkbdc_softc *atkbdc_sc)
+{
+	struct ps2kbd_softc *sc;
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL)
+		return (NULL);
+
+	pthread_mutex_init(&sc->mtx, NULL);
+	fifo_init(sc);
+	sc->atkbdc_sc = atkbdc_sc;
+
+	console_kbd_register(ps2kbd_event, sc, 1);
+
+	return (sc);
+}
+
diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h
new file mode 100644
index 0000000000..17be6d0466
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2kbd.h
@@ -0,0 +1,41 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PS2KBD_H_
+#define _PS2KBD_H_
+
+/*
+ * NOTE: this header uses uint8_t but includes nothing itself; the including
+ * file must provide the fixed-width integer types first.
+ */
+
+struct atkbdc_softc;
+
+/*
+ * Allocate keyboard state bound to controller 'sc' and register it as a
+ * console keyboard handler.
+ */
+struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc);
+
+/*
+ * Pop the next byte queued by the keyboard into *val.  Returns 0 on
+ * success, -1 when nothing is queued.
+ */
+int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val);
+/*
+ * Deliver a command byte (or the data byte of a pending two-byte command)
+ * to the keyboard; response bytes are queued for ps2kbd_read().
+ */
+void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val);
+
+#endif /* _PS2KBD_H_ */
diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c
new file mode 100644
index 0000000000..b2e08262b1
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2mouse.c
@@ -0,0 +1,418 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "atkbdc.h"
+#include "console.h"
+
+/* mouse device commands */
+#define PS2MC_RESET_DEV 0xff
+#define PS2MC_SET_DEFAULTS 0xf6
+#define PS2MC_DISABLE 0xf5
+#define PS2MC_ENABLE 0xf4
+#define PS2MC_SET_SAMPLING_RATE 0xf3
+#define PS2MC_SEND_DEV_ID 0xf2
+#define PS2MC_SET_REMOTE_MODE 0xf0
+#define PS2MC_SEND_DEV_DATA 0xeb
+#define PS2MC_SET_STREAM_MODE 0xea
+#define PS2MC_SEND_DEV_STATUS 0xe9
+#define PS2MC_SET_RESOLUTION 0xe8
+#define PS2MC_SET_SCALING1 0xe7
+#define PS2MC_SET_SCALING2 0xe6
+
+#define PS2MC_BAT_SUCCESS 0xaa
+#define PS2MC_ACK 0xfa
+
+/* mouse device id */
+#define PS2MOUSE_DEV_ID 0x0
+
+/* mouse data bits */
+#define PS2M_DATA_Y_OFLOW 0x80
+#define PS2M_DATA_X_OFLOW 0x40
+#define PS2M_DATA_Y_SIGN 0x20
+#define PS2M_DATA_X_SIGN 0x10
+#define PS2M_DATA_AONE 0x08
+#define PS2M_DATA_MID_BUTTON 0x04
+#define PS2M_DATA_RIGHT_BUTTON 0x02
+#define PS2M_DATA_LEFT_BUTTON 0x01
+
+/* mouse status bits */
+#define PS2M_STS_REMOTE_MODE 0x40
+#define PS2M_STS_ENABLE_DEV 0x20
+#define PS2M_STS_SCALING_21 0x10
+#define PS2M_STS_MID_BUTTON 0x04
+#define PS2M_STS_RIGHT_BUTTON 0x02
+#define PS2M_STS_LEFT_BUTTON 0x01
+
+#define PS2MOUSE_FIFOSZ 16
+
+/*
+ * Fixed-capacity byte ring buffer.  fifo_put() stores at windex and
+ * fifo_get() loads from rindex; both indices wrap modulo 'size'.
+ */
+struct fifo {
+ uint8_t buf[PS2MOUSE_FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of bytes in the fifo */
+ int size; /* size of the fifo */
+};
+
+/*
+ * Per-mouse state.  'mtx' is held by the read/write/toggle entry points and
+ * by ps2mouse_event() while they touch the fields below.
+ */
+struct ps2mouse_softc {
+ struct atkbdc_softc *atkbdc_sc; /* controller notified of new data */
+ pthread_mutex_t mtx;
+
+ uint8_t status; /* PS2M_STS_* bits */
+ uint8_t resolution; /* last value set by PS2MC_SET_RESOLUTION */
+ uint8_t sampling_rate; /* last value set by PS2MC_SET_SAMPLING_RATE */
+ int ctrlenable; /* set/cleared by ps2mouse_toggle() */
+ struct fifo fifo; /* bytes waiting to be read by ps2mouse_read() */
+
+ uint8_t curcmd; /* current command for next byte */
+
+ int cur_x, cur_y; /* last absolute position seen */
+ int delta_x, delta_y; /* movement accumulated since last report */
+};
+
+/*
+ * Record the FIFO capacity (the byte size of its buffer).  Leaves the
+ * indices and count untouched.
+ */
+static void
+fifo_init(struct ps2mouse_softc *sc)
+{
+	sc->fifo.size = sizeof (sc->fifo.buf);
+}
+
+/*
+ * Discard everything queued: zero the whole FIFO and then restore its
+ * capacity.
+ */
+static void
+fifo_reset(struct ps2mouse_softc *sc)
+{
+	bzero(&sc->fifo, sizeof (sc->fifo));
+	sc->fifo.size = sizeof (sc->fifo.buf);
+}
+
+/*
+ * Append one byte to the FIFO.  The byte is silently dropped when the FIFO
+ * is already full.
+ */
+static void
+fifo_put(struct ps2mouse_softc *sc, uint8_t val)
+{
+	struct fifo *q = &sc->fifo;
+
+	if (q->num >= q->size)
+		return;
+
+	q->buf[q->windex] = val;
+	q->windex = (q->windex + 1) % q->size;
+	q->num++;
+}
+
+/*
+ * Remove the oldest byte from the FIFO into *val.  Returns 0 on success or
+ * -1 (leaving *val untouched) when the FIFO is empty.
+ */
+static int
+fifo_get(struct ps2mouse_softc *sc, uint8_t *val)
+{
+	struct fifo *q = &sc->fifo;
+
+	if (q->num <= 0)
+		return (-1);
+
+	*val = q->buf[q->rindex];
+	q->rindex = (q->rindex + 1) % q->size;
+	q->num--;
+	return (0);
+}
+
+/*
+ * Forget any movement accumulated so far.  The caller must hold sc->mtx.
+ */
+static void
+movement_reset(struct ps2mouse_softc *sc)
+{
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	sc->delta_x = sc->delta_y = 0;
+}
+
+/*
+ * Fold a new absolute pointer position into the accumulated deltas and
+ * remember it as the current position.  The Y delta is inverted: a smaller
+ * 'y' yields a positive delta_y.
+ */
+static void
+movement_update(struct ps2mouse_softc *sc, int x, int y)
+{
+	int dx = x - sc->cur_x;
+	int dy = sc->cur_y - y;
+
+	sc->cur_x = x;
+	sc->cur_y = y;
+	sc->delta_x += dx;
+	sc->delta_y += dy;
+}
+
+/*
+ * Encode one axis delta for a movement packet.  Returns the low eight bits
+ * of the (saturated) delta and ORs 'sign_bit' and/or 'oflow_bit' into
+ * *flags as needed.  Together with the sign bit the byte forms a 9-bit
+ * two's-complement value.
+ *
+ * Deltas outside [-255, 255] set the overflow bit and saturate.  The
+ * negative limit is encoded as 0x01 with the sign bit set (-255);
+ * returning 0xff there, as was done previously, would read back as -1.
+ */
+static uint8_t
+movement_encode(int delta, uint8_t sign_bit, uint8_t oflow_bit, uint8_t *flags)
+{
+	if (delta < 0)
+		*flags |= sign_bit;
+
+	if (delta > 255) {
+		*flags |= oflow_bit;
+		return (255);
+	}
+	if (delta < -255) {
+		*flags |= oflow_bit;
+		return ((uint8_t)-255);
+	}
+	return ((uint8_t)delta);
+}
+
+/*
+ * Build a three-byte movement packet (flags/buttons, X delta, Y delta)
+ * from the accumulated movement and current button state, and queue it.
+ * The accumulated deltas are cleared whether or not the packet is queued.
+ * The caller must hold sc->mtx.
+ */
+static void
+movement_get(struct ps2mouse_softc *sc)
+{
+	uint8_t val0, val1, val2;
+
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	val0 = PS2M_DATA_AONE;
+	val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON |
+	    PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON);
+
+	val1 = movement_encode(sc->delta_x, PS2M_DATA_X_SIGN,
+	    PS2M_DATA_X_OFLOW, &val0);
+	sc->delta_x = 0;
+
+	val2 = movement_encode(sc->delta_y, PS2M_DATA_Y_SIGN,
+	    PS2M_DATA_Y_OFLOW, &val0);
+	sc->delta_y = 0;
+
+	/*
+	 * Queue the packet only as a unit.
+	 * NOTE(review): this test requires four free slots for a three-byte
+	 * packet; kept as in the original -- confirm whether the spare slot
+	 * is intentional.
+	 */
+	if (sc->fifo.num < (sc->fifo.size - 3)) {
+		fifo_put(sc, val0);
+		fifo_put(sc, val1);
+		fifo_put(sc, val2);
+	}
+}
+
+/*
+ * Return the mouse to its power-on defaults: empty FIFO, no pending
+ * movement, position (0, 0), reporting enabled, resolution 4 and sampling
+ * rate 100.  The caller must hold sc->mtx.
+ */
+static void
+ps2mouse_reset(struct ps2mouse_softc *sc)
+{
+	assert(pthread_mutex_isowned_np(&sc->mtx));
+
+	fifo_reset(sc);
+	/* Clears delta_x and delta_y. */
+	movement_reset(sc);
+
+	sc->cur_x = 0;
+	sc->cur_y = 0;
+
+	sc->status = PS2M_STS_ENABLE_DEV;
+	sc->resolution = 4;
+	sc->sampling_rate = 100;
+}
+
+/*
+ * Fetch the next byte queued by the mouse into *val while holding the
+ * softc mutex.  Returns 0 on success, -1 if nothing is queued.
+ */
+int
+ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val)
+{
+	int rc;
+
+	pthread_mutex_lock(&sc->mtx);
+	rc = fifo_get(sc, val);
+	pthread_mutex_unlock(&sc->mtx);
+	return (rc);
+}
+
+/*
+ * Return the number of bytes currently queued in the mouse FIFO.  The
+ * count is read without taking sc->mtx.
+ */
+int
+ps2mouse_fifocnt(struct ps2mouse_softc *sc)
+{
+	const struct fifo *q = &sc->fifo;
+
+	return (q->num);
+}
+
+/*
+ * Set or clear the controller-side enable flag.  Disabling also empties
+ * the FIFO by rewinding its indices and count.
+ */
+void
+ps2mouse_toggle(struct ps2mouse_softc *sc, int enable)
+{
+	pthread_mutex_lock(&sc->mtx);
+	sc->ctrlenable = enable ? 1 : 0;
+	if (!enable) {
+		struct fifo *q = &sc->fifo;
+
+		q->rindex = 0;
+		q->windex = 0;
+		q->num = 0;
+	}
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Deliver one byte written by the guest to the mouse.  The FIFO is emptied
+ * first on every call.
+ *
+ * If a two-byte command is pending (sc->curcmd), 'val' is its data byte
+ * and is stored and acknowledged.  Otherwise, when 'insert' is non-zero
+ * 'val' is placed in the FIFO as-is; when it is zero 'val' is interpreted
+ * as a command and the response bytes are queued.
+ */
+void
+ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert)
+{
+	uint8_t pending;
+
+	pthread_mutex_lock(&sc->mtx);
+	fifo_reset(sc);
+
+	pending = sc->curcmd;
+	if (pending != 0) {
+		/* 'val' is the argument of the previous command. */
+		sc->curcmd = 0;
+		switch (pending) {
+		case PS2MC_SET_SAMPLING_RATE:
+			sc->sampling_rate = val;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_SET_RESOLUTION:
+			sc->resolution = val;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		default:
+			fprintf(stderr, "Unhandled ps2 mouse current "
+			    "command byte 0x%02x\n", val);
+			break;
+		}
+	} else if (insert) {
+		fifo_put(sc, val);
+	} else {
+		switch (val) {
+		case PS2MC_SET_SAMPLING_RATE:
+		case PS2MC_SET_RESOLUTION:
+			/* Two-byte commands: remember for the data byte. */
+			sc->curcmd = val;
+			/* FALLTHROUGH */
+		case 0x00:
+		case PS2MC_SET_SCALING1:
+		case PS2MC_SET_SCALING2:
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_RESET_DEV:
+			ps2mouse_reset(sc);
+			fifo_put(sc, PS2MC_ACK);
+			fifo_put(sc, PS2MC_BAT_SUCCESS);
+			fifo_put(sc, PS2MOUSE_DEV_ID);
+			break;
+		case PS2MC_SET_DEFAULTS:
+			ps2mouse_reset(sc);
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_DISABLE:
+			fifo_reset(sc);
+			sc->status &= ~PS2M_STS_ENABLE_DEV;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_ENABLE:
+			fifo_reset(sc);
+			sc->status |= PS2M_STS_ENABLE_DEV;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_SEND_DEV_ID:
+			fifo_put(sc, PS2MC_ACK);
+			fifo_put(sc, PS2MOUSE_DEV_ID);
+			break;
+		case PS2MC_SET_REMOTE_MODE:
+			sc->status |= PS2M_STS_REMOTE_MODE;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_SET_STREAM_MODE:
+			sc->status &= ~PS2M_STS_REMOTE_MODE;
+			fifo_put(sc, PS2MC_ACK);
+			break;
+		case PS2MC_SEND_DEV_DATA:
+			fifo_put(sc, PS2MC_ACK);
+			movement_get(sc);
+			break;
+		case PS2MC_SEND_DEV_STATUS:
+			fifo_put(sc, PS2MC_ACK);
+			fifo_put(sc, sc->status);
+			fifo_put(sc, sc->resolution);
+			fifo_put(sc, sc->sampling_rate);
+			break;
+		default:
+			/* Unknown commands are still acknowledged. */
+			fifo_put(sc, PS2MC_ACK);
+			fprintf(stderr, "Unhandled ps2 mouse command "
+			    "0x%02x\n", val);
+			break;
+		}
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Console pointer callback ('arg' is the ps2mouse_softc).  Records the new
+ * position and button state.  If data reporting is enabled both on the
+ * device (PS2M_STS_ENABLE_DEV) and by the controller (ctrlenable), a
+ * movement packet is queued and the keyboard controller is notified when
+ * the FIFO holds data.
+ *
+ * In 'button', bit 0 is the left, bit 1 the middle and bit 2 the right
+ * button.
+ */
+static void
+ps2mouse_event(uint8_t button, int x, int y, void *arg)
+{
+	struct ps2mouse_softc *sc = arg;
+	int notify;
+
+	pthread_mutex_lock(&sc->mtx);
+	movement_update(sc, x, y);
+
+	sc->status &= ~(PS2M_STS_LEFT_BUTTON |
+	    PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON);
+	if (button & (1 << 0))
+		sc->status |= PS2M_STS_LEFT_BUTTON;
+	if (button & (1 << 1))
+		sc->status |= PS2M_STS_MID_BUTTON;
+	if (button & (1 << 2))
+		sc->status |= PS2M_STS_RIGHT_BUTTON;
+
+	if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) {
+		/* no data reporting */
+		pthread_mutex_unlock(&sc->mtx);
+		return;
+	}
+
+	movement_get(sc);
+	/*
+	 * Sample the FIFO level while still holding the lock; reading it
+	 * after the unlock would race with ps2mouse_read()/ps2mouse_write().
+	 */
+	notify = (sc->fifo.num > 0);
+	pthread_mutex_unlock(&sc->mtx);
+
+	if (notify)
+		atkbdc_event(sc->atkbdc_sc, 0);
+}
+
+/*
+ * Allocate and initialize the mouse state for the given controller, reset
+ * it to defaults and register ps2mouse_event() as a console pointer
+ * handler.
+ *
+ * Returns the new softc, or NULL if the allocation fails (previously a
+ * failed allocation was dereferenced immediately).
+ */
+struct ps2mouse_softc *
+ps2mouse_init(struct atkbdc_softc *atkbdc_sc)
+{
+	struct ps2mouse_softc *sc;
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL)
+		return (NULL);
+
+	pthread_mutex_init(&sc->mtx, NULL);
+	fifo_init(sc);
+	sc->atkbdc_sc = atkbdc_sc;
+
+	/* ps2mouse_reset() asserts that the mutex is held. */
+	pthread_mutex_lock(&sc->mtx);
+	ps2mouse_reset(sc);
+	pthread_mutex_unlock(&sc->mtx);
+
+	console_ptr_register(ps2mouse_event, sc, 1);
+
+	return (sc);
+}
+
+
diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h
new file mode 100644
index 0000000000..59430b01e2
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2mouse.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PS2MOUSE_H_
+#define _PS2MOUSE_H_
+
+/* Only pointers to the keyboard-controller state are used in this header. */
+struct atkbdc_softc;
+
+/*
+ * Create a PS/2 mouse instance bound to the given keyboard-controller
+ * state and return it.
+ */
+struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc);
+
+/*
+ * Device access entry points, defined in ps2mouse.c.
+ * NOTE(review): presumably driven by the atkbdc emulation; 'val' looks like
+ * the data byte exchanged with the device - confirm against ps2mouse.c.
+ */
+int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val);
+void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert);
+void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable);
+int ps2mouse_fifocnt(struct ps2mouse_softc *sc);
+
+#endif /* _PS2MOUSE_H_ */
diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c
new file mode 100644
index 0000000000..39ea1611f9
--- /dev/null
+++ b/usr/src/cmd/bhyve/rfb.c
@@ -0,0 +1,1148 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Leon Dang
+ * Copyright 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/endian.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <arpa/inet.h>
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <zlib.h>
+
+#ifndef __FreeBSD__
+#include <sys/debug.h>
+#endif
+
+#include "bhyvegc.h"
+#include "console.h"
+#include "rfb.h"
+#include "sockstream.h"
+
+#ifndef NO_OPENSSL
+#include <openssl/des.h>
+#endif
+
+/* Set to non-zero to enable DPRINTF() output. */
+static int rfb_debug = 0;
+
+/*
+ * Debug / warning printf wrappers.  Both take a parenthesised printf
+ * argument list, e.g. DPRINTF(("x=%d\n", x));
+ *
+ * They are wrapped in do { } while (0) so that they behave as a single
+ * statement; the bare "if (...) printf" form would capture a following
+ * "else" when used inside an unbraced if/else.
+ */
+#define	DPRINTF(params) do { if (rfb_debug) printf params; } while (0)
+#define	WPRINTF(params) do { printf params; } while (0)
+
+/* Sizes, in bytes, of the VNC-auth challenge and of the DES key. */
+#define	AUTH_LENGTH	16
+#define	PASSWD_LENGTH	8
+
+/* Security types exchanged during the RFB handshake. */
+#define	SECURITY_TYPE_NONE	1
+#define	SECURITY_TYPE_VNC_AUTH	2
+
+#define	AUTH_FAILED_UNAUTH	1
+#define	AUTH_FAILED_ERROR	2
+
+/*
+ * Per-server RFB (VNC) state.  One instance is allocated by rfb_init() /
+ * rfb_init_unix() and handed to the accept thread.
+ */
+struct rfb_softc {
+ int sfd; /* listening socket */
+ pthread_t tid; /* accept thread (rfb_thr) */
+
+ int cfd; /* current client socket; set to -1 when rfb_handle() ends */
+
+ int width, height; /* image size last seen by rfb_recv_update_msg() */
+
+ char *password; /* VNC-auth password, or NULL for no authentication */
+
+ /* Encodings announced by the client; cleared before every accept(). */
+ bool enc_raw_ok;
+ bool enc_zlib_ok;
+ bool enc_resize_ok;
+
+ z_stream zstream; /* deflate state, valid while enc_zlib_ok is set */
+ uint8_t *zbuf; /* staging buffer for outgoing pixel data */
+ int zbuflen; /* NOTE(review): not referenced in the code shown here */
+
+ int conn_wait; /* non-zero: rfb_init*() blocks until first client */
+ int sending; /* a screen update is in progress (guarded by mtx) */
+ pthread_mutex_t mtx;
+ pthread_cond_t cond; /* signalled when the first client connects */
+
+ int hw_crc; /* non-zero: use the SSE4.2 CRC32 instruction */
+ uint32_t *crc; /* WxH crc cells */
+ uint32_t *crc_tmp; /* buffer to store single crc row */
+ int crc_width, crc_height;
+};
+
+/*
+ * Pixel format description.  Sent as part of the server-init message and
+ * received (and ignored) in the client's set-pixel-format message.
+ * Multi-byte fields are converted with htons() before sending.
+ */
+struct rfb_pixfmt {
+ uint8_t bpp;
+ uint8_t depth;
+ uint8_t bigendian;
+ uint8_t truecolor;
+ uint16_t red_max;
+ uint16_t green_max;
+ uint16_t blue_max;
+ uint8_t red_shift;
+ uint8_t green_shift;
+ uint8_t blue_shift;
+ uint8_t pad[3];
+};
+
+/*
+ * Server-init message; written to the socket as-is and followed by
+ * 'namelen' bytes of desktop name.
+ */
+struct rfb_srvr_info {
+ uint16_t width;
+ uint16_t height;
+ struct rfb_pixfmt pixfmt;
+ uint32_t namelen;
+};
+
+/*
+ * Client set-pixel-format message.  The leading 'type' byte has already
+ * been consumed by rfb_handle(); the remainder is read at offset 1.
+ */
+struct rfb_pixfmt_msg {
+ uint8_t type;
+ uint8_t pad[3];
+ struct rfb_pixfmt pixfmt;
+};
+
+/* Encoding identifiers understood by this server. */
+#define	RFB_ENCODING_RAW	0
+#define	RFB_ENCODING_ZLIB	6
+#define	RFB_ENCODING_RESIZE	(-223)
+
+/* Largest framebuffer the staging and CRC buffers are sized for. */
+#define	RFB_MAX_WIDTH	2000
+#define	RFB_MAX_HEIGHT	1200
+/*
+ * Bytes needed for one full frame at 4 bytes per pixel.  Parenthesised so
+ * the macro expands safely inside larger expressions.
+ */
+#define	RFB_ZLIB_BUFSZ	(RFB_MAX_WIDTH * RFB_MAX_HEIGHT * 4)
+
+/* percentage changes to screen before sending the entire screen */
+#define	RFB_SEND_ALL_THRESH	25
+
+/*
+ * Client-to-server messages.  rfb_handle() reads the first byte ('type')
+ * to dispatch, and each receive routine then reads the rest of the
+ * structure starting at offset 1.  Multi-byte fields arrive in network
+ * byte order.
+ */
+
+/* Type 2: set-encodings; followed by 'numencs' 32-bit encoding ids. */
+struct rfb_enc_msg {
+ uint8_t type;
+ uint8_t pad;
+ uint16_t numencs;
+};
+
+/* Type 3: framebuffer update request. */
+struct rfb_updt_msg {
+ uint8_t type;
+ uint8_t incremental;
+ uint16_t x;
+ uint16_t y;
+ uint16_t width;
+ uint16_t height;
+};
+
+/* Type 4: key event; forwarded to console_key_event(). */
+struct rfb_key_msg {
+ uint8_t type;
+ uint8_t down;
+ uint16_t pad;
+ uint32_t code;
+};
+
+/* Type 5: pointer event; forwarded to console_ptr_event(). */
+struct rfb_ptr_msg {
+ uint8_t type;
+ uint8_t button;
+ uint16_t x;
+ uint16_t y;
+};
+
+/* Server-to-client framebuffer update header (type 0). */
+struct rfb_srvr_updt_msg {
+ uint8_t type;
+ uint8_t pad;
+ uint16_t numrects;
+};
+
+/* Header preceding each rectangle of a framebuffer update. */
+struct rfb_srvr_rect_hdr {
+ uint16_t x;
+ uint16_t y;
+ uint16_t width;
+ uint16_t height;
+ uint32_t encoding;
+};
+
+/* Type 6: client cut text; followed by 'length' bytes that are discarded. */
+struct rfb_cuttext_msg {
+ uint8_t type;
+ uint8_t padding[3];
+ uint32_t length;
+};
+
+
+/*
+ * Send the server-init message: current framebuffer geometry, the fixed
+ * 32bpp little-endian true-colour pixel format, and the desktop name
+ * "bhyve".
+ */
+static void
+rfb_send_server_init_msg(int cfd)
+{
+	static const char name[] = "bhyve";
+	struct bhyvegc_image *gc_image;
+	struct rfb_srvr_info sinfo;
+
+	gc_image = console_get_image();
+
+	/*
+	 * The structure goes on the wire verbatim; clear it first so the
+	 * pad bytes do not leak uninitialised stack contents to the client.
+	 */
+	memset(&sinfo, 0, sizeof (sinfo));
+
+	sinfo.width = htons(gc_image->width);
+	sinfo.height = htons(gc_image->height);
+	sinfo.pixfmt.bpp = 32;
+	sinfo.pixfmt.depth = 32;
+	sinfo.pixfmt.bigendian = 0;
+	sinfo.pixfmt.truecolor = 1;
+	sinfo.pixfmt.red_max = htons(255);
+	sinfo.pixfmt.green_max = htons(255);
+	sinfo.pixfmt.blue_max = htons(255);
+	sinfo.pixfmt.red_shift = 16;
+	sinfo.pixfmt.green_shift = 8;
+	sinfo.pixfmt.blue_shift = 0;
+	sinfo.namelen = htonl(strlen(name));
+	(void)stream_write(cfd, &sinfo, sizeof(sinfo));
+	(void)stream_write(cfd, name, strlen(name));
+}
+
+/*
+ * Tell the client that the desktop size changed: a framebuffer update
+ * carrying a single pseudo-rectangle with the resize encoding, whose
+ * width/height are the new dimensions stored in rc.
+ */
+static void
+rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_srvr_updt_msg hdr;
+	struct rfb_srvr_rect_hdr rect;
+
+	/* Update header: exactly one rectangle follows. */
+	hdr.type = 0;
+	hdr.pad = 0;
+	hdr.numrects = htons(1);
+
+	/* The rectangle itself only conveys the new geometry. */
+	rect.x = htons(0);
+	rect.y = htons(0);
+	rect.width = htons(rc->width);
+	rect.height = htons(rc->height);
+	rect.encoding = htonl(RFB_ENCODING_RESIZE);
+
+	stream_write(cfd, &hdr, sizeof(struct rfb_srvr_updt_msg));
+	stream_write(cfd, &rect, sizeof(struct rfb_srvr_rect_hdr));
+}
+
+/*
+ * Consume a set-pixel-format message.  The type byte was already read by
+ * the caller, so only the remainder is pulled off the socket; the
+ * requested format is discarded.
+ */
+static void
+rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_pixfmt_msg msg;
+	uint8_t *rest = (uint8_t *)&msg + 1;
+
+	(void)stream_read(cfd, rest, sizeof(msg) - 1);
+}
+
+
+/*
+ * Handle a set-encodings message: read the encoding count and then each
+ * 32-bit encoding id, recording which of raw / zlib / resize the client
+ * supports.  The type byte was already consumed by the caller.
+ *
+ * A short read aborts processing instead of acting on uninitialised
+ * data.  The zlib stream is initialised at most once per connection and
+ * zlib is only marked usable when deflateInit() succeeds.
+ */
+static void
+rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_enc_msg enc_msg;
+	uint32_t encoding;
+	ssize_t len;
+	int i, numencs;
+
+	assert((sizeof(enc_msg) - 1) == 3);
+	len = stream_read(cfd, (uint8_t *)&enc_msg + 1, sizeof(enc_msg) - 1);
+	if (len != (ssize_t)(sizeof(enc_msg) - 1))
+		return;
+
+	numencs = ntohs(enc_msg.numencs);
+	for (i = 0; i < numencs; i++) {
+		len = stream_read(cfd, &encoding, sizeof(encoding));
+		if (len != (ssize_t)sizeof(encoding))
+			return;
+
+		switch (ntohl(encoding)) {
+		case RFB_ENCODING_RAW:
+			rc->enc_raw_ok = true;
+			break;
+		case RFB_ENCODING_ZLIB:
+			/*
+			 * Clients may list zlib more than once or resend
+			 * the message; re-initialising a live stream
+			 * would leak its internal state.
+			 */
+			if (!rc->enc_zlib_ok &&
+			    deflateInit(&rc->zstream, Z_BEST_SPEED) == Z_OK)
+				rc->enc_zlib_ok = true;
+			break;
+		case RFB_ENCODING_RESIZE:
+			rc->enc_resize_ok = true;
+			break;
+		default:
+			/* Unsupported encodings are ignored. */
+			break;
+		}
+	}
+}
+
+/*
+ * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only
+ *
+ * Folds 'len' bytes at 'buf' into the running value 'crcval', one 32-bit
+ * word at a time, and returns the updated value.  Only len / 4 whole words
+ * are processed; any trailing len % 4 bytes are ignored.
+ */
+static __inline uint32_t
+fast_crc32(void *buf, int len, uint32_t crcval)
+{
+ uint32_t q = len / sizeof(uint32_t);
+ uint32_t *p = (uint32_t *)buf;
+
+ while (q--) {
+ /*
+ * The instruction is emitted as raw opcode bytes. The constraints
+ * place the running crc in %esi (in and out) and the data word
+ * in %ecx.
+ */
+ asm volatile (
+ ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;"
+ :"=S" (crcval)
+ :"0" (crcval), "c" (*p)
+ );
+ p++;
+ }
+
+ return (crcval);
+}
+
+
+/*
+ * Send one rectangle of the framebuffer as its own framebuffer-update
+ * message.
+ *
+ *   x, y   top-left corner of the rectangle, in pixels
+ *   w, h   size of the rectangle, in pixels
+ *
+ * The rectangle is zlib-compressed when the client accepted that
+ * encoding; if deflate() fails, zlib is disabled for the connection and
+ * the whole rectangle is sent raw instead.
+ *
+ * Returns the result of the last stream_write(): <= 0 on failure.
+ */
+static int
+rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc,
+    int x, int y, int w, int h)
+{
+	struct rfb_srvr_updt_msg supdt_msg;
+	struct rfb_srvr_rect_hdr srect_hdr;
+	uint32_t zlen;		/* exactly the 4 bytes that go on the wire */
+	ssize_t nwrite, total;
+	int err, row, yend;
+	uint32_t *p;
+	uint8_t *zbufp;
+
+	/* Number of rectangles: 1 */
+	supdt_msg.type = 0;
+	supdt_msg.pad = 0;
+	supdt_msg.numrects = htons(1);
+	nwrite = stream_write(cfd, &supdt_msg,
+	    sizeof(struct rfb_srvr_updt_msg));
+	if (nwrite <= 0)
+		return (nwrite);
+
+	/* Rectangle header */
+	srect_hdr.x = htons(x);
+	srect_hdr.y = htons(y);
+	srect_hdr.width = htons(w);
+	srect_hdr.height = htons(h);
+
+	yend = y + h;
+	w *= sizeof(uint32_t);		/* from here on: row length in bytes */
+	if (rc->enc_zlib_ok) {
+		zbufp = rc->zbuf;
+		rc->zstream.total_in = 0;
+		rc->zstream.total_out = 0;
+		p = &gc->data[y * gc->width + x];
+		/*
+		 * Iterate with a private row counter so that 'y' still
+		 * names the first row if we have to fall back to raw.
+		 */
+		for (row = y; row < yend; row++) {
+			rc->zstream.next_in = (Bytef *)p;
+			rc->zstream.avail_in = w;
+			rc->zstream.next_out = (Bytef *)zbufp;
+			rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 -
+			    rc->zstream.total_out;
+			rc->zstream.data_type = Z_BINARY;
+
+			/* Compress with zlib */
+			err = deflate(&rc->zstream, Z_SYNC_FLUSH);
+			if (err != Z_OK) {
+				WPRINTF(("zlib[rect] deflate err: %d\n", err));
+				rc->enc_zlib_ok = false;
+				deflateEnd(&rc->zstream);
+				goto doraw;
+			}
+			zbufp = rc->zbuf + rc->zstream.total_out;
+			p += gc->width;
+		}
+		srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB);
+		nwrite = stream_write(cfd, &srect_hdr,
+		    sizeof(struct rfb_srvr_rect_hdr));
+		if (nwrite <= 0)
+			return (nwrite);
+
+		zlen = htonl(rc->zstream.total_out);
+		nwrite = stream_write(cfd, &zlen, sizeof(zlen));
+		if (nwrite <= 0)
+			return (nwrite);
+		return (stream_write(cfd, rc->zbuf, rc->zstream.total_out));
+	}
+
+doraw:
+
+	/*
+	 * Raw path: gather the rows into the staging buffer.  Always start
+	 * from the first row of the rectangle, since the header already
+	 * built above announces the full height.
+	 */
+	total = 0;
+	zbufp = rc->zbuf;
+	p = &gc->data[y * gc->width + x];
+	for (row = y; row < yend; row++) {
+		memcpy(zbufp, p, w);
+		zbufp += w;
+		total += w;
+		p += gc->width;
+	}
+
+	srect_hdr.encoding = htonl(RFB_ENCODING_RAW);
+	nwrite = stream_write(cfd, &srect_hdr,
+	    sizeof(struct rfb_srvr_rect_hdr));
+	if (nwrite <= 0)
+		return (nwrite);
+
+	total = stream_write(cfd, rc->zbuf, total);
+
+	return (total);
+}
+
+/*
+ * Send the entire framebuffer as a single-rectangle update, zlib
+ * compressed when the client accepted that encoding and raw otherwise.
+ * If deflate() fails, zlib is disabled for the connection and the frame
+ * is sent raw.
+ *
+ * Returns the result of the last stream_write(): <= 0 on failure.
+ */
+static int
+rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc)
+{
+	struct rfb_srvr_updt_msg supdt_msg;
+	struct rfb_srvr_rect_hdr srect_hdr;
+	ssize_t nwrite;
+	uint32_t zlen;		/* exactly the 4 bytes that go on the wire */
+	int err;
+
+	/* Number of rectangles: 1 */
+	supdt_msg.type = 0;
+	supdt_msg.pad = 0;
+	supdt_msg.numrects = htons(1);
+	nwrite = stream_write(cfd, &supdt_msg,
+	    sizeof(struct rfb_srvr_updt_msg));
+	if (nwrite <= 0)
+		return (nwrite);
+
+	/* Rectangle header */
+	srect_hdr.x = 0;
+	srect_hdr.y = 0;
+	srect_hdr.width = htons(gc->width);
+	srect_hdr.height = htons(gc->height);
+	if (rc->enc_zlib_ok) {
+		rc->zstream.next_in = (Bytef *)gc->data;
+		rc->zstream.avail_in = gc->width * gc->height *
+		    sizeof(uint32_t);
+		rc->zstream.next_out = (Bytef *)rc->zbuf;
+		rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16;
+		rc->zstream.data_type = Z_BINARY;
+
+		rc->zstream.total_in = 0;
+		rc->zstream.total_out = 0;
+
+		/* Compress with zlib */
+		err = deflate(&rc->zstream, Z_SYNC_FLUSH);
+		if (err != Z_OK) {
+			WPRINTF(("zlib deflate err: %d\n", err));
+			rc->enc_zlib_ok = false;
+			deflateEnd(&rc->zstream);
+			goto doraw;
+		}
+
+		srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB);
+		nwrite = stream_write(cfd, &srect_hdr,
+		    sizeof(struct rfb_srvr_rect_hdr));
+		if (nwrite <= 0)
+			return (nwrite);
+
+		/* Compressed length prefix, then the compressed data. */
+		zlen = htonl(rc->zstream.total_out);
+		nwrite = stream_write(cfd, &zlen, sizeof(zlen));
+		if (nwrite <= 0)
+			return (nwrite);
+		return (stream_write(cfd, rc->zbuf, rc->zstream.total_out));
+	}
+
+doraw:
+	srect_hdr.encoding = htonl(RFB_ENCODING_RAW);
+	nwrite = stream_write(cfd, &srect_hdr,
+	    sizeof(struct rfb_srvr_rect_hdr));
+	if (nwrite <= 0)
+		return (nwrite);
+
+	nwrite = stream_write(cfd, gc->data,
+	    gc->width * gc->height * sizeof(uint32_t));
+
+	return (nwrite);
+}
+
+/* Change detection works on square cells of PIX_PER_CELL pixels. */
+#define PIX_PER_CELL 32
+#define PIXCELL_SHIFT 5
+#define PIXCELL_MASK 0x1F
+
+/*
+ * Push the current framebuffer contents to the client.
+ *
+ * With 'all' set the whole screen is sent. Otherwise a CRC is computed
+ * for every cell and compared with the value stored from the previous
+ * pass; if at least RFB_SEND_ALL_THRESH percent of the cells changed the
+ * whole screen is sent, else only the changed cells are sent one by one.
+ *
+ * Returns 1 if another send is already in progress or after a per-cell
+ * pass completes; otherwise the result of rfb_send_all(), or the failing
+ * rfb_send_rect() result (<= 0).
+ */
+static int
+rfb_send_screen(struct rfb_softc *rc, int cfd, int all)
+{
+ struct bhyvegc_image *gc_image;
+ ssize_t nwrite;
+ int x, y;
+ int celly, cellwidth;
+ int xcells, ycells;
+ int w, h;
+ uint32_t *p;
+ int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */
+ int retval;
+ uint32_t *crc_p, *orig_crc;
+ int changes;
+
+ console_refresh();
+ gc_image = console_get_image();
+
+ /* Only one sender at a time; a concurrent caller simply backs off. */
+ pthread_mutex_lock(&rc->mtx);
+ if (rc->sending) {
+ pthread_mutex_unlock(&rc->mtx);
+ return (1);
+ }
+ rc->sending = 1;
+ pthread_mutex_unlock(&rc->mtx);
+
+ retval = 0;
+
+ if (all) {
+ retval = rfb_send_all(rc, cfd, gc_image);
+ goto done;
+ }
+
+ /*
+ * Calculate the checksum for each 32x32 cell. Send each that
+ * has changed since the last scan.
+ */
+
+ /*
+ * Resolution changed
+ * NOTE(review): no comparison is made here; the stored dimensions are
+ * refreshed from the image on every pass.
+ */
+
+ rc->crc_width = gc_image->width;
+ rc->crc_height = gc_image->height;
+
+ w = rc->crc_width;
+ h = rc->crc_height;
+ xcells = howmany(rc->crc_width, PIX_PER_CELL);
+ ycells = howmany(rc->crc_height, PIX_PER_CELL);
+
+ /* Width of the last (partial) column; 0 means it is a full cell. */
+ rem_x = w & PIXCELL_MASK;
+
+ /* Height of the last row of cells; a full cell if h is a multiple. */
+ rem_y = h & PIXCELL_MASK;
+ if (!rem_y)
+ rem_y = PIX_PER_CELL;
+
+ p = gc_image->data;
+
+ /*
+ * Go through all cells and calculate crc. If significant number
+ * of changes, then send entire screen.
+ * crc_tmp is dual purpose: to store the new crc and to flag as
+ * a cell that has changed.
+ */
+ /*
+ * Both cursors start one cell-row before their arrays and are advanced
+ * by xcells at the first pixel row of every cell (including y == 0).
+ */
+ crc_p = rc->crc_tmp - xcells;
+ orig_crc = rc->crc - xcells;
+ changes = 0;
+ memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells);
+ for (y = 0; y < h; y++) {
+ if ((y & PIXCELL_MASK) == 0) {
+ crc_p += xcells;
+ orig_crc += xcells;
+ }
+
+ for (x = 0; x < xcells; x++) {
+ if (x == (xcells - 1) && rem_x > 0)
+ cellwidth = rem_x;
+ else
+ cellwidth = PIX_PER_CELL;
+
+ /* Fold this pixel row of the cell into the cell's running crc. */
+ if (rc->hw_crc)
+ crc_p[x] = fast_crc32(p,
+ cellwidth * sizeof(uint32_t),
+ crc_p[x]);
+ else
+ crc_p[x] = (uint32_t)crc32(crc_p[x],
+ (Bytef *)p,
+ cellwidth * sizeof(uint32_t));
+
+ p += cellwidth;
+
+ /* check for crc delta if last row in cell */
+ if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) {
+ if (orig_crc[x] != crc_p[x]) {
+ /* Remember the new crc; reuse crc_tmp as a "changed" flag. */
+ orig_crc[x] = crc_p[x];
+ crc_p[x] = 1;
+ changes++;
+ } else {
+ crc_p[x] = 0;
+ }
+ }
+ }
+ }
+
+ /* If number of changes is > THRESH percent, send the whole screen */
+ /* NOTE(review): divides by xcells * ycells; a zero-sized image would fault. */
+ if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) {
+ retval = rfb_send_all(rc, cfd, gc_image);
+ goto done;
+ }
+
+ /* Go through all cells, and send only changed ones */
+ crc_p = rc->crc_tmp;
+ for (y = 0; y < h; y += PIX_PER_CELL) {
+ /* previous cell's row */
+ celly = (y >> PIXCELL_SHIFT);
+
+ /* Delta check crc to previous set */
+ for (x = 0; x < xcells; x++) {
+ if (*crc_p++ == 0)
+ continue;
+
+ if (x == (xcells - 1) && rem_x > 0)
+ cellwidth = rem_x;
+ else
+ cellwidth = PIX_PER_CELL;
+ nwrite = rfb_send_rect(rc, cfd,
+ gc_image,
+ x * PIX_PER_CELL,
+ celly * PIX_PER_CELL,
+ cellwidth,
+ y + PIX_PER_CELL >= h ? rem_y : PIX_PER_CELL);
+ if (nwrite <= 0) {
+ retval = nwrite;
+ goto done;
+ }
+ }
+ }
+ retval = 1;
+
+done:
+ /* Release the "sending" claim taken at the top. */
+ pthread_mutex_lock(&rc->mtx);
+ rc->sending = 0;
+ pthread_mutex_unlock(&rc->mtx);
+
+ return (retval);
+}
+
+
+/*
+ * Handle a framebuffer-update request.  The type byte was already read
+ * by the caller.
+ *
+ * If the size the client asked for differs from the current image size,
+ * the new size is recorded and (when the client supports it) a resize
+ * notification is sent.  Unless 'discardonly' is set, the full screen is
+ * then pushed to the client.
+ *
+ * A short read returns without acting on the incomplete request.
+ */
+static void
+rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly)
+{
+	struct rfb_updt_msg updt_msg;
+	struct bhyvegc_image *gc_image;
+	ssize_t len;
+
+	len = stream_read(cfd, (uint8_t *)&updt_msg + 1,
+	    sizeof(updt_msg) - 1);
+	if (len != (ssize_t)(sizeof(updt_msg) - 1))
+		return;
+
+	console_refresh();
+	gc_image = console_get_image();
+
+	/* Convert the request geometry to host byte order. */
+	updt_msg.x = ntohs(updt_msg.x);
+	updt_msg.y = ntohs(updt_msg.y);
+	updt_msg.width = ntohs(updt_msg.width);
+	updt_msg.height = ntohs(updt_msg.height);
+
+	if (updt_msg.width != gc_image->width ||
+	    updt_msg.height != gc_image->height) {
+		rc->width = gc_image->width;
+		rc->height = gc_image->height;
+		if (rc->enc_resize_ok)
+			rfb_send_resize_update_msg(rc, cfd);
+	}
+
+	if (discardonly)
+		return;
+
+	rfb_send_screen(rc, cfd, 1);
+}
+
+/*
+ * Handle a key event: read the rest of the message (the type byte was
+ * consumed by the caller) and forward it to the console layer.  Nothing
+ * is forwarded if the message could not be read completely, so a dying
+ * connection cannot inject a key built from uninitialised stack data.
+ */
+static void
+rfb_recv_key_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_key_msg key_msg;
+	ssize_t len;
+
+	len = stream_read(cfd, (uint8_t *)&key_msg + 1, sizeof(key_msg) - 1);
+	if (len != (ssize_t)(sizeof(key_msg) - 1))
+		return;
+
+	console_key_event(key_msg.down, ntohl(key_msg.code));
+}
+
+/*
+ * Handle a pointer event: read the rest of the message (the type byte
+ * was consumed by the caller) and forward button state and position to
+ * the console layer.  Nothing is forwarded on a short read.
+ */
+static void
+rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_ptr_msg ptr_msg;
+	ssize_t len;
+
+	len = stream_read(cfd, (uint8_t *)&ptr_msg + 1, sizeof(ptr_msg) - 1);
+	if (len != (ssize_t)(sizeof(ptr_msg) - 1))
+		return;
+
+	console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y));
+}
+
+/*
+ * Handle a client-cut-text message.  The clipboard text is not used; it
+ * is read from the socket in small chunks and thrown away.
+ *
+ * The drain loop stops as soon as a read fails or hits end-of-stream.
+ * Previously a return of 0 left the remaining count unchanged and a
+ * negative return increased it, so a dropped connection made this loop
+ * spin forever.
+ */
+static void
+rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd)
+{
+	struct rfb_cuttext_msg ct_msg;
+	unsigned char buf[32];
+	uint32_t remaining;
+	ssize_t len;
+
+	len = stream_read(cfd, (uint8_t *)&ct_msg + 1, sizeof(ct_msg) - 1);
+	if (len != (ssize_t)(sizeof(ct_msg) - 1))
+		return;
+
+	remaining = ntohl(ct_msg.length);
+	while (remaining > 0) {
+		len = stream_read(cfd, buf, remaining > sizeof(buf) ?
+		    sizeof(buf) : remaining);
+		if (len <= 0)
+			break;
+		remaining -= len;
+	}
+}
+
+/*
+ * Return (now - prev) in microseconds; negative if 'now' is earlier.
+ *
+ * The seconds are widened to 64 bits before scaling so the multiplication
+ * cannot overflow where time_t is narrower than int64_t.
+ */
+static int64_t
+timeval_delta(struct timeval *prev, struct timeval *now)
+{
+	int64_t n1, n2;
+
+	n1 = (int64_t)now->tv_sec * 1000000 + now->tv_usec;
+	n2 = (int64_t)prev->tv_sec * 1000000 + prev->tv_usec;
+	return (n1 - n2);
+}
+
+/*
+ * Writer thread: while the connection is open (rc->cfd >= 0), push
+ * incremental screen updates to the client roughly every 40ms.
+ *
+ * The thread exits when the connection is torn down, when select() fails
+ * for a reason other than EINTR, or when sending the screen fails.
+ */
+static void *
+rfb_wr_thr(void *arg)
+{
+	struct rfb_softc *rc;
+	fd_set rfds;
+	struct timeval tv;
+	struct timeval prev_tv;
+	int64_t tdiff;
+	int cfd;
+	int err;
+
+	rc = arg;
+	cfd = rc->cfd;
+
+	prev_tv.tv_sec = 0;
+	prev_tv.tv_usec = 0;
+	while (rc->cfd >= 0) {
+		FD_ZERO(&rfds);
+		FD_SET(cfd, &rfds);
+		tv.tv_sec = 0;
+		tv.tv_usec = 10000;
+
+		err = select(cfd+1, &rfds, NULL, NULL, &tv);
+		if (err < 0) {
+			/* An interrupted wait is not a connection error. */
+			if (errno == EINTR)
+				continue;
+			return (NULL);
+		}
+
+		/* Determine if its time to push screen; ~24hz */
+		gettimeofday(&tv, NULL);
+		tdiff = timeval_delta(&prev_tv, &tv);
+		/*
+		 * gettimeofday() is not monotonic: if the clock stepped
+		 * backwards tdiff is negative, and sleeping for
+		 * (40000 - tdiff) could stall updates for a long time.
+		 * Treat that case as "due now" and resynchronise.
+		 */
+		if (tdiff < 0 || tdiff > 40000) {
+			prev_tv.tv_sec = tv.tv_sec;
+			prev_tv.tv_usec = tv.tv_usec;
+			if (rfb_send_screen(rc, cfd, 0) <= 0) {
+				return (NULL);
+			}
+		} else {
+			/* sleep */
+			usleep(40000 - tdiff);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Serve one RFB client on 'cfd': protocol version exchange, security
+ * negotiation (optionally VNC authentication), server-init, then the
+ * client message loop.  A writer thread pushing periodic screen updates
+ * runs for the duration of the message loop.
+ *
+ * The security type returned by the client must be the one the server
+ * offered.  Without that check a client could answer "None" to a server
+ * that offered only VNC authentication and skip the password, or answer
+ * "VNC auth" to a password-less server and make it dereference a NULL
+ * password.
+ *
+ * The caller owns 'cfd' and closes it after this function returns.
+ */
+void
+rfb_handle(struct rfb_softc *rc, int cfd)
+{
+	const char *vbuf = "RFB 003.008\n";
+	unsigned char buf[80];
+	unsigned char *message = NULL;
+	unsigned char sectype;
+
+#ifndef NO_OPENSSL
+	unsigned char challenge[AUTH_LENGTH];
+	unsigned char keystr[PASSWD_LENGTH];
+	unsigned char crypt_expected[AUTH_LENGTH];
+
+	DES_key_schedule ks;
+	int i;
+#endif
+
+	pthread_t tid;
+	uint32_t sres = 0;
+	int len;
+	int thr_err = 1;	/* 0 once the writer thread is running */
+
+	rc->cfd = cfd;
+
+	/* 1a. Send server version */
+	stream_write(cfd, vbuf, strlen(vbuf));
+
+	/* 1b. Read client version */
+	len = read(cfd, buf, sizeof(buf));
+	if (len <= 0)
+		goto done;
+
+	/* 2a. Send security type */
+#ifndef NO_OPENSSL
+	if (rc->password)
+		sectype = SECURITY_TYPE_VNC_AUTH;
+	else
+		sectype = SECURITY_TYPE_NONE;
+#else
+	sectype = SECURITY_TYPE_NONE;
+#endif
+	buf[0] = 1;		/* number of security types offered */
+	buf[1] = sectype;
+
+	stream_write(cfd, buf, 2);
+
+	/* 2b. Read agreed security type */
+	len = stream_read(cfd, buf, 1);
+	if (len != 1)
+		goto done;
+
+	/* 2c. Do VNC authentication */
+	if (buf[0] != sectype) {
+		/* The client picked something that was never offered. */
+		message = (unsigned char *)
+		    "Auth Failed: Invalid security type.";
+		sres = htonl(1);
+	}
+#ifndef NO_OPENSSL
+	else if (sectype == SECURITY_TYPE_VNC_AUTH) {
+		/*
+		 * The client encrypts the challenge with DES, using a password
+		 * supplied by the user as the key.
+		 * To form the key, the password is truncated to
+		 * eight characters, or padded with null bytes on the right.
+		 * The client then sends the resulting 16-bytes response.
+		 */
+		strncpy((char *)keystr, rc->password, PASSWD_LENGTH);
+
+		/*
+		 * VNC clients encrypt the challenge with all the bit fields
+		 * in each byte of the password mirrored.
+		 * Here we flip each byte of the keystr.
+		 */
+		for (i = 0; i < PASSWD_LENGTH; i++) {
+			keystr[i] = (keystr[i] & 0xF0) >> 4
+			    | (keystr[i] & 0x0F) << 4;
+			keystr[i] = (keystr[i] & 0xCC) >> 2
+			    | (keystr[i] & 0x33) << 2;
+			keystr[i] = (keystr[i] & 0xAA) >> 1
+			    | (keystr[i] & 0x55) << 1;
+		}
+
+		/* Initialize a 16-byte random challenge */
+		arc4random_buf(challenge, sizeof(challenge));
+		stream_write(cfd, challenge, AUTH_LENGTH);
+
+		/* Receive the 16-byte challenge response */
+		if (stream_read(cfd, buf, AUTH_LENGTH) != AUTH_LENGTH)
+			goto done;
+
+		memcpy(crypt_expected, challenge, AUTH_LENGTH);
+
+		/* Encrypt the Challenge with DES, one 8-byte block at a time */
+		DES_set_key((const_DES_cblock *)keystr, &ks);
+		DES_ecb_encrypt((const_DES_cblock *)challenge,
+		    (DES_cblock *)crypt_expected,
+		    &ks, DES_ENCRYPT);
+		DES_ecb_encrypt((const_DES_cblock *)(challenge + PASSWD_LENGTH),
+		    (DES_cblock *)(crypt_expected + PASSWD_LENGTH),
+		    &ks, DES_ENCRYPT);
+
+		if (memcmp(crypt_expected, buf, AUTH_LENGTH) != 0) {
+			message = (unsigned char *)
+			    "Auth Failed: Invalid Password.";
+			sres = htonl(1);
+		} else
+			sres = 0;
+	}
+#endif
+
+	/* 2d. Write back a status */
+	stream_write(cfd, &sres, 4);
+
+	if (sres) {
+		/* Failure reason: 32-bit big-endian length, then the text. */
+#ifdef __FreeBSD__
+		be32enc(buf, strlen(message));
+		stream_write(cfd, buf, 4);
+		stream_write(cfd, message, strlen(message));
+#else
+		be32enc(buf, strlen((char *)message));
+		stream_write(cfd, buf, 4);
+		stream_write(cfd, message, strlen((char *)message));
+#endif
+		goto done;
+	}
+
+	/* 3a. Read client shared-flag byte */
+	len = stream_read(cfd, buf, 1);
+
+	/* 4a. Write server-init info */
+	rfb_send_server_init_msg(cfd);
+
+	/* The staging buffer is allocated once and reused by later clients. */
+	if (!rc->zbuf) {
+		rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16);
+		assert(rc->zbuf != NULL);
+	}
+
+	rfb_send_screen(rc, cfd, 1);
+
+	thr_err = pthread_create(&tid, NULL, rfb_wr_thr, rc);
+	if (thr_err == 0)
+		pthread_set_name_np(tid, "rfbout");
+
+	/* Now read in client requests. 1st byte identifies type */
+	for (;;) {
+		len = read(cfd, buf, 1);
+		if (len <= 0) {
+			DPRINTF(("rfb client exiting\r\n"));
+			break;
+		}
+
+		switch (buf[0]) {
+		case 0:
+			rfb_recv_set_pixfmt_msg(rc, cfd);
+			break;
+		case 2:
+			rfb_recv_set_encodings_msg(rc, cfd);
+			break;
+		case 3:
+			rfb_recv_update_msg(rc, cfd, 1);
+			break;
+		case 4:
+			rfb_recv_key_msg(rc, cfd);
+			break;
+		case 5:
+			rfb_recv_ptr_msg(rc, cfd);
+			break;
+		case 6:
+			rfb_recv_cuttext_msg(rc, cfd);
+			break;
+		default:
+			WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff));
+			goto done;
+		}
+	}
+done:
+	/* Setting cfd to -1 tells the writer thread to stop. */
+	rc->cfd = -1;
+	if (thr_err == 0)
+		pthread_join(tid, NULL);
+	if (rc->enc_zlib_ok) {
+		deflateEnd(&rc->zstream);
+		rc->enc_zlib_ok = false;
+	}
+}
+
+/*
+ * Accept thread: serves one client at a time, forever.
+ *
+ * SIGPIPE is blocked so that a write to a vanished client is reported as
+ * an error instead of a signal.  The first successful accept releases
+ * rfb_init*() when it was asked to wait for a client.
+ */
+static void *
+rfb_thr(void *arg)
+{
+	struct rfb_softc *rc;
+	sigset_t set;
+
+	int cfd;
+
+	rc = arg;
+
+	sigemptyset(&set);
+	sigaddset(&set, SIGPIPE);
+	if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) {
+		perror("pthread_sigmask");
+		return (NULL);
+	}
+
+	for (;;) {
+		/* Encoding support is negotiated afresh by every client. */
+		rc->enc_raw_ok = false;
+		rc->enc_zlib_ok = false;
+		rc->enc_resize_ok = false;
+
+		cfd = accept(rc->sfd, NULL, NULL);
+		if (cfd < 0) {
+			/*
+			 * Do not run the protocol on an invalid descriptor.
+			 * Transient conditions are retried silently; other
+			 * errors are reported, with a short pause so a
+			 * persistent failure does not spin.
+			 */
+			if (errno != EINTR && errno != ECONNABORTED) {
+				perror("accept");
+				(void) usleep(100000);
+			}
+			continue;
+		}
+
+		if (rc->conn_wait) {
+			/*
+			 * Clear the flag under the mutex, before signalling,
+			 * so a waiter that checks it sees a consistent state.
+			 */
+			pthread_mutex_lock(&rc->mtx);
+			rc->conn_wait = 0;
+			pthread_cond_signal(&rc->cond);
+			pthread_mutex_unlock(&rc->mtx);
+		}
+		rfb_handle(rc, cfd);
+		close(cfd);
+	}
+
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Report whether the CPU advertises SSE4.2: query CPUID leaf 1 and test
+ * the CPUID2_SSE42 bit in the third result word (ECX).
+ * Returns 1 if the bit is set, 0 otherwise.
+ */
+static int
+sse42_supported(void)
+{
+	u_int regs[4];
+
+	do_cpuid(1, regs);
+
+	return ((regs[2] & CPUID2_SSE42) != 0);
+}
+
+/*
+ * Start the RFB server on a TCP socket.
+ *
+ *   hostname  numeric address to bind to; NULL or "" selects loopback
+ *   port      TCP port, or 0 for the default of 5900
+ *   wait      non-zero: do not return until the first client connects
+ *   password  VNC-auth password, or NULL for no authentication
+ *
+ * Returns 0 on success and -1 on failure.  On failure everything
+ * allocated here is released again.
+ */
+int
+rfb_init(char *hostname, int port, int wait, char *password)
+{
+	int e;
+	char servname[6];
+	struct rfb_softc *rc;
+	struct addrinfo *ai = NULL;
+	struct addrinfo hints;
+	int on = 1;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	rc = calloc(1, sizeof(struct rfb_softc));
+	if (rc == NULL) {
+		perror("calloc");
+		return (-1);
+	}
+	rc->sfd = -1;
+
+	rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32),
+	    sizeof(uint32_t));
+	rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32),
+	    sizeof(uint32_t));
+	if (rc->crc == NULL || rc->crc_tmp == NULL) {
+		perror("calloc");
+		goto fail;
+	}
+	rc->crc_width = RFB_MAX_WIDTH;
+	rc->crc_height = RFB_MAX_HEIGHT;
+
+	rc->password = password;
+
+	snprintf(servname, sizeof(servname), "%d", port ? port : 5900);
+
+	/*
+	 * Default to loopback.  The braces matter: without them, a build
+	 * that defines neither INET nor INET6 left the "if" with no body of
+	 * its own, so it swallowed the memset() of 'hints' below.  The IPv6
+	 * literal is written without brackets because AI_NUMERICHOST
+	 * expects a plain numeric address.
+	 */
+	if (!hostname || strlen(hostname) == 0) {
+#if defined(INET6) && !defined(INET)
+		hostname = "::1";
+#else
+		hostname = "127.0.0.1";
+#endif
+	}
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV | AI_PASSIVE;
+
+	if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) {
+		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e));
+		ai = NULL;
+		goto fail;
+	}
+
+	rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0);
+	if (rc->sfd < 0) {
+		perror("socket");
+		goto fail;
+	}
+
+	setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+	if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) {
+		perror("bind");
+		goto fail;
+	}
+
+	if (listen(rc->sfd, 1) < 0) {
+		perror("listen");
+		goto fail;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(rc->sfd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	rc->hw_crc = sse42_supported();
+
+	/*
+	 * The mutex is used on every screen update, not only while waiting
+	 * for the first client, so initialise it unconditionally.
+	 */
+	rc->conn_wait = wait;
+	pthread_mutex_init(&rc->mtx, NULL);
+	pthread_cond_init(&rc->cond, NULL);
+
+	e = pthread_create(&rc->tid, NULL, rfb_thr, rc);
+	if (e != 0) {
+		fprintf(stderr, "pthread_create: %s\n", strerror(e));
+		pthread_cond_destroy(&rc->cond);
+		pthread_mutex_destroy(&rc->mtx);
+		goto fail;
+	}
+	pthread_set_name_np(rc->tid, "rfb");
+
+	if (wait) {
+		DPRINTF(("Waiting for rfb client...\n"));
+		pthread_mutex_lock(&rc->mtx);
+		/*
+		 * Skip the wait if the accept thread has already seen a
+		 * client and cleared the flag; its signal would otherwise
+		 * have been missed.
+		 */
+		if (rc->conn_wait)
+			pthread_cond_wait(&rc->cond, &rc->mtx);
+		pthread_mutex_unlock(&rc->mtx);
+	}
+
+	freeaddrinfo(ai);
+	return (0);
+
+fail:
+	if (ai != NULL)
+		freeaddrinfo(ai);
+	if (rc->sfd >= 0)
+		(void) close(rc->sfd);
+	free(rc->crc);
+	free(rc->crc_tmp);
+	free(rc);
+	return (-1);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Start the RFB server on a UNIX-domain socket.
+ *
+ *   path      filesystem path of the socket; an existing entry is removed
+ *   wait      non-zero: do not return until the first client connects
+ *   password  VNC-auth password, or NULL for no authentication
+ *
+ * Returns 0 on success and -1 on failure, in which case everything
+ * allocated here is released again.
+ */
+int
+rfb_init_unix(char *path, int wait, char *password)
+{
+	struct rfb_softc *rc;
+	struct sockaddr_un sock;
+
+	if ((rc = calloc(1, sizeof (struct rfb_softc))) == NULL) {
+		perror("calloc");
+		return (-1);
+	}
+	rc->sfd = -1;
+
+	if ((rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32),
+	    sizeof (uint32_t))) == NULL) {
+		perror("calloc");
+		goto fail;
+	}
+	if ((rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32),
+	    sizeof (uint32_t))) == NULL) {
+		perror("calloc");
+		goto fail;
+	}
+	rc->crc_width = RFB_MAX_WIDTH;
+	rc->crc_height = RFB_MAX_HEIGHT;
+
+	rc->password = password;
+
+	rc->sfd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (rc->sfd < 0) {
+		perror("socket");
+		goto fail;
+	}
+
+	/* Do not hand uninitialised bytes to bind(). */
+	(void) memset(&sock, 0, sizeof (sock));
+	sock.sun_family = AF_UNIX;
+	if (strlcpy(sock.sun_path, path, sizeof (sock.sun_path)) >=
+	    sizeof (sock.sun_path)) {
+		(void) fprintf(stderr, "socket path '%s' too long\n", path);
+		goto fail;
+	}
+
+	/* Remove a stale socket left behind by an earlier run. */
+	(void) unlink(path);
+	if (bind(rc->sfd, (struct sockaddr *)&sock, sizeof (sock)) < 0) {
+		perror("bind");
+		goto fail;
+	}
+
+	if (listen(rc->sfd, 1) < 0) {
+		perror("listen");
+		goto fail;
+	}
+
+	rc->hw_crc = sse42_supported();
+
+	/*
+	 * The mutex is used on every screen update, not only while waiting
+	 * for the first client, so initialise it unconditionally.
+	 */
+	rc->conn_wait = wait;
+	VERIFY3S(pthread_mutex_init(&rc->mtx, NULL), ==, 0);
+	VERIFY3S(pthread_cond_init(&rc->cond, NULL), ==, 0);
+
+	VERIFY3S(pthread_create(&rc->tid, NULL, rfb_thr, rc), ==, 0);
+	pthread_set_name_np(rc->tid, "rfb");
+
+	if (wait) {
+		DPRINTF(("Waiting for rfb client...\n"));
+		VERIFY3S(pthread_mutex_lock(&rc->mtx), ==, 0);
+		/*
+		 * Skip the wait if the accept thread has already seen a
+		 * client and cleared the flag; its signal would otherwise
+		 * have been missed.
+		 */
+		if (rc->conn_wait) {
+			VERIFY3S(pthread_cond_wait(&rc->cond, &rc->mtx),
+			    ==, 0);
+		}
+		VERIFY3S(pthread_mutex_unlock(&rc->mtx), ==, 0);
+	}
+
+	return (0);
+
+fail:
+	if (rc->sfd != -1) {
+		VERIFY3S(close(rc->sfd), ==, 0);
+	}
+	free(rc->crc);
+	free(rc->crc_tmp);
+	free(rc);
+	return (-1);
+}
+#endif
diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h
new file mode 100644
index 0000000000..990e2075ac
--- /dev/null
+++ b/usr/src/cmd/bhyve/rfb.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RFB_H_
+#define _RFB_H_
+
+/* Default TCP port of the RFB (VNC) server. */
+#define RFB_PORT 5900
+
+/*
+ * Start the RFB server on a TCP socket bound to hostname:port. If 'wait'
+ * is non-zero the call blocks until a client connects. 'password' enables
+ * VNC authentication when non-NULL. Returns 0 on success, -1 on failure.
+ */
+int rfb_init(char *hostname, int port, int wait, char *password);
+#ifndef __FreeBSD__
+/* Same as rfb_init(), but listening on the UNIX-domain socket at 'path'. */
+int rfb_init_unix(char *path, int wait, char *password);
+#endif
+
+#endif /* _RFB_H_ */
diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c
new file mode 100644
index 0000000000..09ca3f61ae
--- /dev/null
+++ b/usr/src/cmd/bhyve/rtc.c
@@ -0,0 +1,131 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <time.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "pci_lpc.h"
+#include "rtc.h"
+
+/* I/O port handed to dsdt_fixed_ioport() when describing the RTC device. */
+#define IO_RTC 0x70
+
+/* NVRAM cell offsets that rtc_init() fills with the guest memory size. */
+#define RTC_LMEM_LSB 0x34
+#define RTC_LMEM_MSB 0x35
+#define RTC_HMEM_LSB 0x5b
+#define RTC_HMEM_SB 0x5c
+#define RTC_HMEM_MSB 0x5d
+
+/* Byte-size constants; rtc_init() reports memory in 64KB chunks. */
+#define m_64KB (64*1024)
+#define m_16MB (16*1024*1024)
+#define m_4GB (4ULL*1024*1024*1024)
+
+/*
+ * Produce the value the guest RTC should start from, expressed as seconds
+ * since 00:00:00 Jan 1, 1970.
+ *
+ * When use_localtime is zero the current time() value is returned unchanged.
+ * Otherwise the current time is broken down with localtime_r() and those
+ * fields are converted back with timegm(), so the result is shifted by the
+ * local UTC offset.  ctx is not used.
+ */
+static time_t
+rtc_time(struct vmctx *ctx, int use_localtime)
+{
+	time_t now;
+	struct tm local_fields;
+
+	now = time(NULL);
+	if (!use_localtime)
+		return (now);
+
+	/* Re-encode the local wall-clock fields as if they were UTC. */
+	localtime_r(&now, &local_fields);
+	return (timegm(&local_fields));
+}
+
+/*
+ * Seed the virtual RTC: publish the guest memory size in the NVRAM cells
+ * and set the initial time from rtc_time().  Every vmm call is expected to
+ * succeed; failures trip an assert.
+ */
+void
+rtc_init(struct vmctx *ctx, int use_localtime)
+{
+	size_t lo_chunks, hi_chunks;
+	int err;
+
+	/* XXX init diag/reset code/equipment/checksum ? */
+
+	/*
+	 * UEFI reads the guest memory size from nvram, little-endian:
+	 *   0x34/0x35      - 64KB chunks above 16MB, below 4GB
+	 *   0x5b/0x5c/0x5d - 64KB chunks above 4GB
+	 */
+
+	/* Low memory beyond the first 16MB, one byte per cell. */
+	lo_chunks = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
+	err = vm_rtc_write(ctx, RTC_LMEM_LSB, lo_chunks);
+	assert(err == 0);
+	err = vm_rtc_write(ctx, RTC_LMEM_MSB, lo_chunks >> 8);
+	assert(err == 0);
+
+	/* High memory, three cells. */
+	hi_chunks = vm_get_highmem_size(ctx) / m_64KB;
+	err = vm_rtc_write(ctx, RTC_HMEM_LSB, hi_chunks);
+	assert(err == 0);
+	err = vm_rtc_write(ctx, RTC_HMEM_SB, hi_chunks >> 8);
+	assert(err == 0);
+	err = vm_rtc_write(ctx, RTC_HMEM_MSB, hi_chunks >> 16);
+	assert(err == 0);
+
+	/* Finally, the wall-clock starting point. */
+	err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime));
+	assert(err == 0);
+}
+
+/*
+ * Emit the DSDT text describing the RTC device: a "Device (RTC)" node with
+ * _HID PNP0B00 whose _CRS resource template holds one fixed I/O port entry
+ * (IO_RTC, 2) and fixed IRQ 8.  The function is registered through
+ * LPC_DSDT() below.
+ */
+static void
+rtc_dsdt(void)
+{
+
+ dsdt_line("");
+ dsdt_line("Device (RTC)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ /* Resource entries are emitted two indent levels deeper. */
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_RTC, 2);
+ dsdt_fixed_irq(8);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+}
+LPC_DSDT(rtc_dsdt);
+
+/*
+ * Reserve the extended RTC I/O ports although they are not emulated at this
+ * time.
+ */
+SYSRES_IO(0x72, 6);
diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h
new file mode 100644
index 0000000000..1c108eed99
--- /dev/null
+++ b/usr/src/cmd/bhyve/rtc.h
@@ -0,0 +1,36 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RTC_H_
+#define _RTC_H_
+
+/*
+ * Initialize the guest RTC for the VM identified by ctx.  A non-zero
+ * use_localtime selects local time rather than UTC as the starting time.
+ */
+void rtc_init(struct vmctx *ctx, int use_localtime);
+
+#endif /* _RTC_H_ */
diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c
new file mode 100644
index 0000000000..35a41a0855
--- /dev/null
+++ b/usr/src/cmd/bhyve/smbiostbl.c
@@ -0,0 +1,907 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <md5.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <uuid.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "smbiostbl.h"
+
+#define MB (1024*1024)
+#define GB (1024ULL*1024*1024)
+
+#define SMBIOS_BASE 0xF1000
+
+/* (BHYVE_ACPI_BASE - SMBIOS_BASE) */
+#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000)
+
+#define SMBIOS_TYPE_BIOS 0
+#define SMBIOS_TYPE_SYSTEM 1
+#define SMBIOS_TYPE_CHASSIS 3
+#define SMBIOS_TYPE_PROCESSOR 4
+#define SMBIOS_TYPE_MEMARRAY 16
+#define SMBIOS_TYPE_MEMDEVICE 17
+#define SMBIOS_TYPE_MEMARRAYMAP 19
+#define SMBIOS_TYPE_BOOT 32
+#define SMBIOS_TYPE_EOT 127
+
+/*
+ * Common header at the start of every SMBIOS table structure in this file.
+ * 'length' is the number of bytes copied from a template by
+ * smbios_generic_initializer(); 'handle' is assigned there as well.
+ */
+struct smbios_structure {
+ uint8_t type;
+ uint8_t length;
+ uint16_t handle;
+} __packed;
+
+/*
+ * Signature of a per-structure emitter.  It writes one or more structures
+ * built from template_entry/template_strings starting at curaddr, stores the
+ * address just past the last byte written in *endaddr, and bumps *n once per
+ * structure.  Returns 0 on success, non-zero on failure.
+ */
+typedef int (*initializer_func_t)(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+/*
+ * One row of the smbios_template[] table: a structure template, its
+ * NULL-terminated string list (or NULL for none), and the function that
+ * emits it.
+ */
+struct smbios_template_entry {
+ struct smbios_structure *entry;
+ const char **strings;
+ initializer_func_t initializer;
+};
+
+/*
+ * SMBIOS Structure Table Entry Point
+ */
+#define SMBIOS_ENTRY_EANCHOR "_SM_"
+#define SMBIOS_ENTRY_EANCHORLEN 4
+#define SMBIOS_ENTRY_IANCHOR "_DMI_"
+#define SMBIOS_ENTRY_IANCHORLEN 5
+
+struct smbios_entry_point {
+ char eanchor[4]; /* anchor tag */
+ uint8_t echecksum; /* checksum of entry point structure */
+ uint8_t eplen; /* length in bytes of entry point */
+ uint8_t major; /* major version of the SMBIOS spec */
+ uint8_t minor; /* minor version of the SMBIOS spec */
+ uint16_t maxssize; /* maximum size in bytes of a struct */
+ uint8_t revision; /* entry point structure revision */
+ uint8_t format[5]; /* entry point rev-specific data */
+ char ianchor[5]; /* intermediate anchor tag */
+ uint8_t ichecksum; /* intermediate checksum */
+ uint16_t stlen; /* len in bytes of structure table */
+ uint32_t staddr; /* physical addr of structure table */
+ uint16_t stnum; /* number of structure table entries */
+ uint8_t bcdrev; /* BCD value representing DMI ver */
+} __packed;
+
+/*
+ * BIOS Information
+ */
+#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */
+#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */
+#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */
+#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */
+#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */
+#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */
+
+#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */
+
+#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */
+#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */
+
+struct smbios_table_type0 {
+ struct smbios_structure header;
+ uint8_t vendor; /* vendor string */
+ uint8_t version; /* version string */
+ uint16_t segment; /* address segment location */
+ uint8_t rel_date; /* release date */
+ uint8_t size; /* rom size */
+ uint64_t cflags; /* characteristics */
+ uint8_t xc_bytes[2]; /* characteristics ext bytes */
+ uint8_t sb_major_rel; /* system bios version */
+ uint8_t sb_minor_rele;
+ uint8_t ecfw_major_rel; /* embedded ctrl fw version */
+ uint8_t ecfw_minor_rel;
+} __packed;
+
+/*
+ * System Information
+ */
+#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */
+
+struct smbios_table_type1 {
+ struct smbios_structure header;
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t product; /* product name string */
+ uint8_t version; /* version string */
+ uint8_t serial; /* serial number string */
+ uint8_t uuid[16]; /* uuid byte array */
+ uint8_t wakeup; /* wake-up event */
+ uint8_t sku; /* sku number string */
+ uint8_t family; /* family name string */
+} __packed;
+
+/*
+ * System Enclosure or Chassis
+ */
+#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_CHST_SAFE 0x03 /* safe */
+
+#define SMBIOS_CHSC_NONE 0x03 /* none */
+
+struct smbios_table_type3 {
+ struct smbios_structure header;
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t type; /* type */
+ uint8_t version; /* version string */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t bustate; /* boot-up state */
+ uint8_t psstate; /* power supply state */
+ uint8_t tstate; /* thermal state */
+ uint8_t security; /* security status */
+ uint8_t uheight; /* height in 'u's */
+ uint8_t cords; /* number of power cords */
+ uint8_t elems; /* number of element records */
+ uint8_t elemlen; /* length of records */
+ uint8_t sku; /* sku number string */
+} __packed;
+
+/*
+ * Processor Information
+ */
+#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */
+
+#define SMBIOS_PRF_OTHER 0x01 /* other */
+
+#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */
+#define SMBIOS_PRS_ENABLED 0x1 /* enabled */
+
+#define SMBIOS_PRU_NONE 0x06 /* none */
+
+#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */
+
+struct smbios_table_type4 {
+ struct smbios_structure header;
+ uint8_t socket; /* socket designation string */
+ uint8_t type; /* processor type */
+ uint8_t family; /* processor family */
+ uint8_t manufacturer; /* manufacturer string */
+ uint64_t cpuid; /* processor cpuid */
+ uint8_t version; /* version string */
+ uint8_t voltage; /* voltage */
+ uint16_t clkspeed; /* ext clock speed in mhz */
+ uint16_t maxspeed; /* maximum speed in mhz */
+ uint16_t curspeed; /* current speed in mhz */
+ uint8_t status; /* status */
+ uint8_t upgrade; /* upgrade */
+ uint16_t l1handle; /* l1 cache handle */
+ uint16_t l2handle; /* l2 cache handle */
+ uint16_t l3handle; /* l3 cache handle */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t part; /* part number string */
+ uint8_t cores; /* cores per socket */
+ uint8_t ecores; /* enabled cores */
+ uint8_t threads; /* threads per socket */
+ uint16_t cflags; /* processor characteristics */
+ uint16_t family2; /* processor family 2 */
+} __packed;
+
+/*
+ * Physical Memory Array
+ */
+#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */
+
+#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */
+
+#define SMBIOS_MAE_NONE 0x03 /* none */
+
+struct smbios_table_type16 {
+ struct smbios_structure header;
+ uint8_t location; /* physical device location */
+ uint8_t use; /* device functional purpose */
+ uint8_t ecc; /* err detect/correct method */
+ uint32_t size; /* max mem capacity in kb */
+ uint16_t errhand; /* handle of error (if any) */
+ uint16_t ndevs; /* num of slots or sockets */
+ uint64_t xsize; /* max mem capacity in bytes */
+} __packed;
+
+/*
+ * Memory Device
+ */
+#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */
+
+/* Layout of the SMBIOS Memory Device (type 17) structure. */
+struct smbios_table_type17 {
+ struct smbios_structure header;
+ uint16_t arrayhand; /* handle of physl mem array */
+ uint16_t errhand; /* handle of mem error data */
+ uint16_t twidth; /* total width in bits */
+ uint16_t dwidth; /* data width in bits */
+ uint16_t size; /* size in mb, or kb if bit 15 set (0x7fff=see xsize) */
+ uint8_t form; /* form factor */
+ uint8_t set; /* set */
+ uint8_t dloc; /* device locator string */
+ uint8_t bloc; /* phys bank locator string */
+ uint8_t type; /* memory type */
+ uint16_t flags; /* memory characteristics */
+ uint16_t maxspeed; /* maximum speed in mhz */
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t part; /* part number string */
+ uint8_t attributes; /* attributes */
+ uint32_t xsize; /* extended size in mbs */
+ uint16_t curspeed; /* current speed in mhz */
+ uint16_t minvoltage; /* minimum voltage */
+ uint16_t maxvoltage; /* maximum voltage */
+ uint16_t curvoltage; /* configured voltage */
+} __packed;
+
+/*
+ * Memory Array Mapped Address
+ */
+struct smbios_table_type19 {
+ struct smbios_structure header;
+ uint32_t saddr; /* start phys addr in kb */
+ uint32_t eaddr; /* end phys addr in kb */
+ uint16_t arrayhand; /* physical mem array handle */
+ uint8_t width; /* num of dev in row */
+ uint64_t xsaddr; /* start phys addr in bytes */
+ uint64_t xeaddr; /* end phys addr in bytes */
+} __packed;
+
+/*
+ * System Boot Information
+ */
+#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */
+
+struct smbios_table_type32 {
+ struct smbios_structure header;
+ uint8_t reserved[6];
+ uint8_t status; /* boot status */
+} __packed;
+
+/*
+ * End-of-Table
+ */
+struct smbios_table_type127 {
+ struct smbios_structure header;
+} __packed;
+
+struct smbios_table_type0 smbios_type0_template = {
+ { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 },
+ 1, /* bios vendor string */
+ 2, /* bios version string */
+ 0xF000, /* bios address segment location */
+ 3, /* bios release date */
+ 0x0, /* bios size (64k * (n + 1) is the size in bytes) */
+ SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW |
+ SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD,
+ { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM },
+ 0x0, /* bios major release */
+ 0x0, /* bios minor release */
+ 0xff, /* embedded controller firmware major release */
+ 0xff /* embedded controller firmware minor release */
+};
+
+const char *smbios_type0_strings[] = {
+ "BHYVE", /* vendor string */
+ "1.00", /* bios version string */
+ "03/14/2014", /* bios release date string */
+ NULL
+};
+
+struct smbios_table_type1 smbios_type1_template = {
+ { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 },
+ 1, /* manufacturer string */
+ 2, /* product string */
+ 3, /* version string */
+ 4, /* serial number string */
+ { 0 },
+ SMBIOS_WAKEUP_SWITCH,
+ 5, /* sku string */
+ 6 /* family string */
+};
+
+static int smbios_type1_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+const char *smbios_type1_strings[] = {
+ " ", /* manufacturer string */
+ "BHYVE", /* product name string */
+ "1.0", /* version string */
+ "None", /* serial number string */
+ "None", /* sku string */
+ " ", /* family name string */
+ NULL
+};
+
+struct smbios_table_type3 smbios_type3_template = {
+ { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 },
+ 1, /* manufacturer string */
+ SMBIOS_CHT_UNKNOWN,
+ 2, /* version string */
+ 3, /* serial number string */
+ 4, /* asset tag string */
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHSC_NONE,
+ 0, /* height in 'u's (0=enclosure height unspecified) */
+ 0, /* number of power cords (0=number unspecified) */
+ 0, /* number of contained element records */
+ 0, /* length of records */
+ 5 /* sku number string */
+};
+
+const char *smbios_type3_strings[] = {
+ " ", /* manufacturer string */
+ "1.0", /* version string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* sku number string */
+ NULL
+};
+
+struct smbios_table_type4 smbios_type4_template = {
+ { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 },
+ 1, /* socket designation string */
+ SMBIOS_PRT_CENTRAL,
+ SMBIOS_PRF_OTHER,
+ 2, /* manufacturer string */
+ 0, /* cpuid */
+ 3, /* version string */
+ 0, /* voltage */
+ 0, /* external clock frequency in mhz (0=unknown) */
+ 0, /* maximum frequency in mhz (0=unknown) */
+ 0, /* current frequency in mhz (0=unknown) */
+ SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED,
+ SMBIOS_PRU_NONE,
+ -1, /* l1 cache handle */
+ -1, /* l2 cache handle */
+ -1, /* l3 cache handle */
+ 4, /* serial number string */
+ 5, /* asset tag string */
+ 6, /* part number string */
+ 0, /* cores per socket (0=unknown) */
+ 0, /* enabled cores per socket (0=unknown) */
+ 0, /* threads per socket (0=unknown) */
+ SMBIOS_PFL_64B,
+ SMBIOS_PRF_OTHER
+};
+
+const char *smbios_type4_strings[] = {
+ " ", /* socket designation string */
+ " ", /* manufacturer string */
+ " ", /* version string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* part number string */
+ NULL
+};
+
+static int smbios_type4_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type16 smbios_type16_template = {
+ { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 },
+ SMBIOS_MAL_SYSMB,
+ SMBIOS_MAU_SYSTEM,
+ SMBIOS_MAE_NONE,
+ 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */
+ -1, /* handle of error (if any) */
+ 0, /* number of slots or sockets (TBD) */
+ 0 /* extended maximum memory capacity in bytes (TBD) */
+};
+
+static int smbios_type16_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+/*
+ * Template for Memory Device (type 17) structures.  The array handle and
+ * extended size are filled in by smbios_type17_initializer().
+ */
+struct smbios_table_type17 smbios_type17_template = {
+ { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 },
+ -1, /* handle of physical memory array */
+ -1, /* handle of memory error data */
+ 64, /* total width in bits including ecc */
+ 64, /* data width in bits */
+ 0x7fff, /* size (0x7fff=use extended size field) */
+ SMBIOS_MDFF_UNKNOWN,
+ 0, /* set (0x00=none, 0xff=unknown) */
+ 1, /* device locator string */
+ 2, /* physical bank locator string */
+ SMBIOS_MDT_UNKNOWN,
+ SMBIOS_MDF_UNKNOWN,
+ 0, /* maximum memory speed in mhz (0=unknown) */
+ 3, /* manufacturer string */
+ 4, /* serial number string */
+ 5, /* asset tag string */
+ 6, /* part number string */
+ 0, /* attributes (0=unknown rank information) */
+ 0, /* extended size in mb (TBD) */
+ 0, /* current speed in mhz (0=unknown) */
+ 0, /* minimum voltage in mv (0=unknown) */
+ 0, /* maximum voltage in mv (0=unknown) */
+ 0 /* configured voltage in mv (0=unknown) */
+};
+
+const char *smbios_type17_strings[] = {
+ " ", /* device locator string */
+ " ", /* physical bank locator string */
+ " ", /* manufacturer string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* part number string */
+ NULL
+};
+
+static int smbios_type17_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+/*
+ * Template for Memory Array Mapped Address (type 19) structures.  The array
+ * handle and extended addresses are filled in by smbios_type19_initializer().
+ */
+struct smbios_table_type19 smbios_type19_template = {
+ { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 },
+ 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */
+ 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */
+ -1, /* physical memory array handle */
+ 1, /* number of devices that form a row */
+ 0, /* extended starting phys addr in bytes (TBD) */
+ 0 /* extended ending phys addr in bytes (TBD) */
+};
+
+static int smbios_type19_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type32 smbios_type32_template = {
+ { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 },
+ { 0, 0, 0, 0, 0, 0 },
+ SMBIOS_BOOT_NORMAL
+};
+
+struct smbios_table_type127 smbios_type127_template = {
+ { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 }
+};
+
+static int smbios_generic_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+static struct smbios_template_entry smbios_template[] = {
+ { (struct smbios_structure *)&smbios_type0_template,
+ smbios_type0_strings,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type1_template,
+ smbios_type1_strings,
+ smbios_type1_initializer },
+ { (struct smbios_structure *)&smbios_type3_template,
+ smbios_type3_strings,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type4_template,
+ smbios_type4_strings,
+ smbios_type4_initializer },
+ { (struct smbios_structure *)&smbios_type16_template,
+ NULL,
+ smbios_type16_initializer },
+ { (struct smbios_structure *)&smbios_type17_template,
+ smbios_type17_strings,
+ smbios_type17_initializer },
+ { (struct smbios_structure *)&smbios_type19_template,
+ NULL,
+ smbios_type19_initializer },
+ { (struct smbios_structure *)&smbios_type32_template,
+ NULL,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type127_template,
+ NULL,
+ smbios_generic_initializer },
+ { NULL,NULL, NULL }
+};
+
+/* Guest low/high memory sizes, captured by smbios_build() before any structure is emitted. */
+static uint64_t guest_lomem, guest_himem;
+/* Recorded by smbios_type16_initializer(); stored as 'arrayhand' in type 17 and type 19 structures. */
+static uint16_t type16_handle;
+
+/*
+ * Emit one structure at curaddr: copy template_entry->length bytes of the
+ * template, give the copy handle (*n + 1), then append the string section.
+ *
+ * With a string list, each string is copied including its NUL and a single
+ * extra NUL closes the section.  With template_strings == NULL the section
+ * is just two NUL bytes.  On return *endaddr points one past the last byte
+ * written and *n has been incremented.  'size' is not touched.
+ *
+ * Always returns 0.
+ */
+static int
+smbios_generic_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	struct smbios_structure *hdr;
+	const char **strp;
+	char *cursor;
+
+	/* Formatted area. */
+	hdr = (struct smbios_structure *)curaddr;
+	memcpy(curaddr, template_entry, template_entry->length);
+	hdr->handle = *n + 1;
+	cursor = curaddr + hdr->length;
+
+	/* String section. */
+	if (template_strings == NULL) {
+		/* Minimum string section is double nul */
+		*cursor++ = '\0';
+		*cursor++ = '\0';
+	} else {
+		for (strp = template_strings; *strp != NULL; strp++) {
+			int nbytes = strlen(*strp) + 1;
+
+			memcpy(cursor, *strp, nbytes);
+			cursor += nbytes;
+		}
+		*cursor++ = '\0';
+	}
+
+	*n += 1;
+	*endaddr = cursor;
+
+	return (0);
+}
+
+/*
+ * Emit the System Information (type 1) structure and fill in its UUID.
+ *
+ * If guest_uuid_str is set it is parsed with uuid_from_string() and stored
+ * in little-endian encoding.  Otherwise a UUID is derived from an MD5 digest
+ * of the VM name and the host name, with the version and variant bits forced.
+ *
+ * Returns 0 on success, -1 if the UUID string does not parse or the host
+ * name cannot be obtained.
+ */
+static int
+smbios_type1_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	struct smbios_table_type1 *type1;
+
+	smbios_generic_initializer(template_entry, template_strings,
+	    curaddr, endaddr, n, size);
+	type1 = (struct smbios_table_type1 *)curaddr;
+
+	if (guest_uuid_str != NULL) {
+		uuid_t uuid;
+		uint32_t status;
+
+		uuid_from_string(guest_uuid_str, &uuid, &status);
+		if (status != uuid_s_ok)
+			return (-1);
+
+		uuid_enc_le(&type1->uuid, &uuid);
+	} else {
+		MD5_CTX mdctx;
+		u_char digest[16];
+		/*
+		 * The whole buffer is hashed below, so it must start out
+		 * zeroed: gethostname() only writes the name itself, and
+		 * leftover stack bytes would make the digest differ from
+		 * run to run.
+		 */
+		char hostname[MAXHOSTNAMELEN] = { 0 };
+
+		/*
+		 * Universally unique and yet reproducible are an
+		 * oxymoron, however reproducible is desirable in
+		 * this case.
+		 */
+		if (gethostname(hostname, sizeof(hostname)))
+			return (-1);
+
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, vmname, strlen(vmname));
+		MD5Update(&mdctx, hostname, sizeof(hostname));
+		MD5Final(digest, &mdctx);
+
+		/*
+		 * Set the variant and version number.
+		 */
+		digest[6] &= 0x0F;
+		digest[6] |= 0x30;	/* version 3 */
+		digest[8] &= 0x3F;
+		digest[8] |= 0x80;
+
+		memcpy(&type1->uuid, digest, sizeof (digest));
+	}
+
+	return (0);
+}
+
+/*
+ * Emit one Processor Information (type 4) structure per guest CPU.
+ *
+ * Each structure starts as a generic copy of the template.  A "CPU #<i>"
+ * string is then appended to its string section and the socket designation
+ * is pointed at that new string.  *endaddr ends up just past the last
+ * structure written.
+ *
+ * Always returns 0.
+ */
+static int
+smbios_type4_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < guest_ncpus; cpu++) {
+		struct smbios_table_type4 *type4;
+		char *scan, *tail;
+		int nstrings;
+
+		smbios_generic_initializer(template_entry, template_strings,
+		    curaddr, endaddr, n, size);
+		type4 = (struct smbios_table_type4 *)curaddr;
+
+		/* The NUL that currently closes the string section. */
+		tail = *endaddr - 1;
+
+		/* Count the strings already present. */
+		nstrings = 0;
+		for (scan = curaddr + sizeof (struct smbios_table_type4);
+		    scan < tail; scan++) {
+			if (*scan == '\0')
+				nstrings++;
+		}
+
+		/* Overwrite the closing NUL with the new string... */
+		tail += sprintf(tail, "CPU #%d", cpu) + 1;
+		/* ...and close the section again. */
+		*tail++ = '\0';
+		*endaddr = tail;
+
+		type4->socket = nstrings + 1;
+		curaddr = tail;
+	}
+
+	return (0);
+}
+
+/*
+ * Emit the Physical Memory Array (type 16) structure.
+ *
+ * The extended capacity is the sum of guest low and high memory, and the
+ * device count is 2 when high memory is present, otherwise 1.  The handle
+ * given to this structure is saved in type16_handle so that the type 17 and
+ * type 19 structures can refer back to it.
+ *
+ * Always returns 0.
+ */
+static int
+smbios_type16_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	struct smbios_table_type16 *type16;
+
+	smbios_generic_initializer(template_entry, template_strings,
+	    curaddr, endaddr, n, size);
+	type16 = (struct smbios_table_type16 *)curaddr;
+
+	/*
+	 * Record the handle actually assigned by the generic initializer.
+	 * Sampling *n before the call yields the handle of the preceding
+	 * structure, because handles are assigned as (*n + 1).
+	 */
+	type16_handle = type16->header.handle;
+
+	type16->xsize = guest_lomem + guest_himem;
+	type16->ndevs = guest_himem > 0 ? 2 : 1;
+
+	return (0);
+}
+
+/*
+ * Emit the Memory Device (type 17) structures: one covering guest low
+ * memory and, when high memory is present, a second one covering it.
+ *
+ * Each structure refers to the memory array through type16_handle.  The
+ * template sets the 16-bit size field to 0x7fff, so the real size goes in
+ * the extended size field, which is expressed in megabytes.
+ *
+ * Always returns 0.
+ */
+static int
+smbios_type17_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	struct smbios_table_type17 *type17;
+
+	smbios_generic_initializer(template_entry, template_strings,
+	    curaddr, endaddr, n, size);
+	type17 = (struct smbios_table_type17 *)curaddr;
+	type17->arrayhand = type16_handle;
+	/* Memory sizes are kept in bytes; xsize is a 32-bit count of MB. */
+	type17->xsize = guest_lomem / MB;
+
+	if (guest_himem > 0) {
+		curaddr = *endaddr;
+		smbios_generic_initializer(template_entry, template_strings,
+		    curaddr, endaddr, n, size);
+		type17 = (struct smbios_table_type17 *)curaddr;
+		type17->arrayhand = type16_handle;
+		type17->xsize = guest_himem / MB;
+	}
+
+	return (0);
+}
+
+/*
+ * Emit the Memory Array Mapped Address (type 19) structures: one for the
+ * low memory range starting at 0 and, when high memory is present, a second
+ * one for the range starting at 4GB.
+ *
+ * Both use the extended (byte) address fields, with the end computed as
+ * start + size, and both refer to the memory array via type16_handle.
+ *
+ * Always returns 0.
+ */
+static int
+smbios_type19_initializer(struct smbios_structure *template_entry,
+    const char **template_strings, char *curaddr, char **endaddr,
+    uint16_t *n, uint16_t *size)
+{
+	struct smbios_table_type19 *type19;
+
+	smbios_generic_initializer(template_entry, template_strings,
+	    curaddr, endaddr, n, size);
+	type19 = (struct smbios_table_type19 *)curaddr;
+	type19->arrayhand = type16_handle;
+	type19->xsaddr = 0;
+	type19->xeaddr = guest_lomem;
+
+	if (guest_himem > 0) {
+		curaddr = *endaddr;
+		smbios_generic_initializer(template_entry, template_strings,
+		    curaddr, endaddr, n, size);
+		type19 = (struct smbios_table_type19 *)curaddr;
+		type19->arrayhand = type16_handle;
+		type19->xsaddr = 4*GB;
+		/*
+		 * guest_himem is a size, not an address: the range ends
+		 * that many bytes past its 4GB start.
+		 */
+		type19->xeaddr = type19->xsaddr + guest_himem;
+	}
+
+	return (0);
+}
+
+/*
+ * Fill in the fixed part of the SMBIOS entry point: both anchor strings,
+ * the entry point length, spec version 2.6, and the address of the
+ * structure table.  Everything else is left zero for
+ * smbios_ep_finalizer() to complete.
+ */
+static void
+smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr)
+{
+	memset(smbios_ep, 0, sizeof(*smbios_ep));
+
+	/* Anchor strings are stored without a terminating NUL. */
+	memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR,
+	    SMBIOS_ENTRY_EANCHORLEN);
+	memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR,
+	    SMBIOS_ENTRY_IANCHORLEN);
+
+	/* The packed struct must be exactly as long as advertised. */
+	smbios_ep->eplen = 0x1F;
+	assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen);
+
+	smbios_ep->major = 2;
+	smbios_ep->minor = 6;
+	smbios_ep->revision = 0;
+	smbios_ep->bcdrev = 0x24;
+
+	smbios_ep->staddr = staddr;
+}
+
+/*
+ * Complete the entry point once the structure table has been written:
+ * store the table length, structure count and largest structure size,
+ * then compute the two checksums.
+ *
+ * Each checksum is the value that makes the covered bytes sum to zero
+ * modulo 256.  The intermediate one covers offsets 0x10..0x1e and must be
+ * stored first, because the outer one covers offsets 0x00..0x1e.
+ */
+static void
+smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len,
+    uint16_t num, uint16_t maxssize)
+{
+	const uint8_t *raw = (const uint8_t *)smbios_ep;
+	uint8_t sum;
+	int off;
+
+	smbios_ep->stnum = num;
+	smbios_ep->stlen = len;
+	smbios_ep->maxssize = maxssize;
+
+	/* Intermediate checksum. */
+	sum = 0;
+	for (off = 0x10; off < 0x1f; off++)
+		sum += raw[off];
+	smbios_ep->ichecksum = (uint8_t)(0 - sum);
+
+	/* Checksum of the whole entry point. */
+	sum = 0;
+	for (off = 0; off < 0x1f; off++)
+		sum += raw[off];
+	smbios_ep->echecksum = (uint8_t)(0 - sum);
+}
+
+/*
+ * Build the SMBIOS entry point and structure table in guest memory at
+ * SMBIOS_BASE.
+ *
+ * Each row of smbios_template[] is emitted in order by its initializer,
+ * and the entry point is then completed with the table length, structure
+ * count and maximum structure size.
+ *
+ * Returns 0 on success, ENOMEM if the guest region is not mapped, or the
+ * non-zero value returned by a failing initializer.
+ */
+int
+smbios_build(struct vmctx *ctx)
+{
+	struct smbios_entry_point *smbios_ep;
+	uint16_t n;
+	uint16_t maxssize;
+	char *curaddr, *startaddr, *ststartaddr;
+	int i;
+	int err;
+
+	guest_lomem = vm_get_lowmem_size(ctx);
+	guest_himem = vm_get_highmem_size(ctx);
+
+	startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH);
+	if (startaddr == NULL) {
+		fprintf(stderr, "smbios table requires mapped mem\n");
+		return (ENOMEM);
+	}
+
+	curaddr = startaddr;
+
+	/* The entry point comes first; the table follows it directly. */
+	smbios_ep = (struct smbios_entry_point *)curaddr;
+	smbios_ep_initializer(smbios_ep, SMBIOS_BASE +
+	    sizeof(struct smbios_entry_point));
+	curaddr += sizeof(struct smbios_entry_point);
+	ststartaddr = curaddr;
+
+	n = 0;
+	maxssize = 0;
+	for (i = 0; smbios_template[i].entry != NULL; i++) {
+		struct smbios_structure *entry;
+		const char **strings;
+		initializer_func_t initializer;
+		char *endaddr;
+		uint16_t size;
+
+		entry = smbios_template[i].entry;
+		strings = smbios_template[i].strings;
+		initializer = smbios_template[i].initializer;
+
+		endaddr = curaddr;
+		size = 0;
+		err = (*initializer)(entry, strings, curaddr, &endaddr,
+		    &n, &size);
+		if (err != 0)
+			return (err);
+
+		/*
+		 * None of the initializers stores anything through the size
+		 * pointer, so derive the size from the bytes just emitted.
+		 * For initializers that emit several structures this is an
+		 * upper bound on the largest one.
+		 */
+		size = endaddr - curaddr;
+		if (size > maxssize)
+			maxssize = size;
+
+		curaddr = endaddr;
+	}
+
+	assert(curaddr - startaddr < SMBIOS_MAX_LENGTH);
+	smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize);
+
+	return (0);
+}
+
+/*
+ * Parse an SMBIOS override string of the form
+ *
+ *	<type>,key=value[,key=value...]
+ *
+ * Only type 1 is accepted.  Recognized keys replace the matching type 1
+ * template string, or the guest UUID string for "uuid".  If a key is
+ * repeated, the last value wins.
+ *
+ * Values are staged and only published once the whole string has parsed,
+ * so a failure leaves the existing settings untouched.  On success the
+ * duplicated option buffer is deliberately not freed: the published
+ * pointers refer into it.
+ *
+ * Returns 0 on success, -1 on failure (a message is written to stderr).
+ */
+int
+smbios_parse(const char *opts)
+{
+	char *buf;
+	char *lasts;
+	char *token;
+	char *end;
+	long type;
+	int i;
+	struct {
+		const char *key;
+		const char **targetp;
+	} type1_map[] = {
+		{ "manufacturer", &smbios_type1_strings[0] },
+		{ "product", &smbios_type1_strings[1] },
+		{ "version", &smbios_type1_strings[2] },
+		{ "serial", &smbios_type1_strings[3] },
+		{ "sku", &smbios_type1_strings[4] },
+		{ "family", &smbios_type1_strings[5] },
+		{ "uuid", (const char **)&guest_uuid_str },
+		{ 0 }
+	};
+	/* Staged values, parallel to type1_map[]; NULL means "not given". */
+	const char *pending[sizeof (type1_map) / sizeof (type1_map[0])] =
+	    { NULL };
+
+	if ((buf = strdup(opts)) == NULL) {
+		(void) fprintf(stderr, "out of memory\n");
+		return (-1);
+	}
+
+	if ((token = strtok_r(buf, ",", &lasts)) == NULL) {
+		(void) fprintf(stderr, "too few fields\n");
+		goto fail;
+	}
+
+	errno = 0;
+	type = strtol(token, &end, 10);
+	if (errno != 0 || *end != '\0') {
+		(void) fprintf(stderr, "first token '%s' is not an integer\n",
+		    token);
+		goto fail;
+	}
+
+	/* For now, only type 1 is supported. */
+	if (type != 1) {
+		/* 'type' is a long, so it needs %ld rather than %d. */
+		(void) fprintf(stderr, "unsupported type %ld\n", type);
+		goto fail;
+	}
+
+	while ((token = strtok_r(NULL, ",", &lasts)) != NULL) {
+		char *val;
+
+		if ((val = strchr(token, '=')) == NULL) {
+			(void) fprintf(stderr, "invalid key=value: '%s'\n",
+			    token);
+			goto fail;
+		}
+		/* Split "key=value" in place. */
+		*val = '\0';
+		val++;
+
+		for (i = 0; type1_map[i].key != NULL; i++) {
+			if (strcmp(token, type1_map[i].key) == 0) {
+				break;
+			}
+		}
+		if (type1_map[i].key == NULL) {
+			(void) fprintf(stderr, "invalid key '%s'\n", token);
+			goto fail;
+		}
+		pending[i] = val;
+	}
+
+	/*
+	 * Everything parsed: publish the staged values.  Doing this only
+	 * now means the failure path never frees a buffer that the global
+	 * string pointers already reference.
+	 */
+	for (i = 0; type1_map[i].key != NULL; i++) {
+		if (pending[i] != NULL)
+			*type1_map[i].targetp = pending[i];
+	}
+
+	return (0);
+
+fail:
+	free(buf);
+	return (-1);
+}
diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h
new file mode 100644
index 0000000000..81e26309e5
--- /dev/null
+++ b/usr/src/cmd/bhyve/smbiostbl.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _SMBIOSTBL_H_
+#define _SMBIOSTBL_H_
+
+struct vmctx;
+
+/*
+ * Returns 0 on success; a non-zero error from an individual table step is
+ * passed back unchanged.
+ * NOTE(review): only the tail of the implementation was visible when this
+ * comment was written - confirm against smbiostbl.c.
+ */
+int smbios_build(struct vmctx *ctx);
+
+/*
+ * Parse a "<type>,<key>=<value>[,...]" SMBIOS override string (only type 1
+ * is accepted).  Returns 0 on success and -1, after printing a diagnostic
+ * to stderr, on failure.
+ */
+int smbios_parse(const char *opts);
+
+#endif /* _SMBIOSTBL_H_ */
diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c
new file mode 100644
index 0000000000..b592bce9aa
--- /dev/null
+++ b/usr/src/cmd/bhyve/sockstream.c
@@ -0,0 +1,86 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Nahanni Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <errno.h>
+
+#include "sockstream.h"
+
+/*
+ * Read from 'fd' until 'nbytes' bytes have been stored in 'buf', read(2)
+ * reports end-of-file, or an unrecoverable error occurs.  A read that fails
+ * with EINTR or EAGAIN is retried immediately.
+ *
+ * Returns the number of bytes stored (short only on end-of-file), or the
+ * negative read(2) result on error; bytes gathered before the error are not
+ * reported.
+ */
+ssize_t
+stream_read(int fd, void *buf, ssize_t nbytes)
+{
+	uint8_t *dst = buf;
+	ssize_t done = 0;
+
+	while (done < nbytes) {
+		ssize_t rc = read(fd, dst + done, nbytes - done);
+
+		if (rc > 0) {
+			done += rc;
+		} else if (rc == 0) {
+			/* end-of-file: hand back what we have */
+			break;
+		} else if (errno != EINTR && errno != EAGAIN) {
+			return (rc);
+		}
+		/* otherwise: transient failure, go around again */
+	}
+
+	return (done);
+}
+
+/*
+ * Write 'nbytes' bytes from 'buf' to 'fd', looping over short writes.  A
+ * write that fails with EINTR or EAGAIN is retried immediately; a write that
+ * returns 0 ends the loop.
+ *
+ * Returns the number of bytes written, or the negative write(2) result on
+ * error; bytes written before the error are not reported.
+ */
+ssize_t
+stream_write(int fd, const void *buf, ssize_t nbytes)
+{
+	const uint8_t *src = buf;
+	ssize_t done = 0;
+
+	while (done < nbytes) {
+		ssize_t rc = write(fd, src + done, nbytes - done);
+
+		if (rc > 0) {
+			done += rc;
+		} else if (rc == 0) {
+			/* nothing accepted: stop rather than spin */
+			break;
+		} else if (errno != EINTR && errno != EAGAIN) {
+			return (rc);
+		}
+		/* otherwise: transient failure, go around again */
+	}
+
+	return (done);
+}
diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h
new file mode 100644
index 0000000000..ecea849471
--- /dev/null
+++ b/usr/src/cmd/bhyve/sockstream.h
@@ -0,0 +1,35 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Nahanni Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SOCKSTREAM_H_
+#define _SOCKSTREAM_H_
+
+#include <sys/types.h>
+#include <unistd.h>
+
+/*
+ * Blocking-style helpers that loop over read(2)/write(2) until 'nbytes'
+ * bytes have been transferred, retrying on EINTR/EAGAIN.  Both return the
+ * number of bytes transferred, or the negative system call result on an
+ * unrecoverable error.
+ */
+ssize_t stream_read(int fd, void *buf, ssize_t nbytes);
+ssize_t stream_write(int fd, const void *buf, ssize_t nbytes);
+
+#endif /* _SOCKSTREAM_H_ */
diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c
new file mode 100644
index 0000000000..ecdd05694c
--- /dev/null
+++ b/usr/src/cmd/bhyve/spinup_ap.c
@@ -0,0 +1,110 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "spinup_ap.h"
+
+/*
+ * Point 'newcpu' at the 4K page that contains '*rip': the page number
+ * becomes the CS base/selector and the instruction pointer (and '*rip'
+ * itself) is reset to 0.
+ */
+static void
+spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
+{
+	uint64_t cs_base;
+	uint32_t cs_limit, cs_access;
+	uint16_t cs_sel;
+	int vec, rc;
+
+	vec = *rip >> PAGE_SHIFT;
+	*rip = 0;
+
+	/*
+	 * Update the %cs and %rip of the guest so that it starts
+	 * executing real mode code at 'vec << 12'.
+	 */
+	rc = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+	assert(rc == 0);
+
+	/* Keep the current CS limit and access rights; replace the base. */
+	rc = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &cs_base,
+	    &cs_limit, &cs_access);
+	assert(rc == 0);
+
+	cs_base = vec << PAGE_SHIFT;
+	rc = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+	    cs_base, cs_limit, cs_access);
+	assert(rc == 0);
+
+	/* Selector is the base shifted right by 4. */
+	cs_sel = (vec << PAGE_SHIFT) >> 4;
+	rc = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs_sel);
+	assert(rc == 0);
+}
+
+/*
+ * Reset vcpu 'newcpu', enable the 'unrestricted guest' capability on it,
+ * place it at the start of the 4K page containing 'rip' and hand it to
+ * fbsdrun_addcpu() on behalf of 'vcpu'.
+ *
+ * Every step is asserted to succeed.  Returns 'newcpu'.
+ */
+int
+spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
+{
+	int rc;
+
+	/* vcpu 0 is never started this way and the id must be in range. */
+	assert(newcpu != 0);
+	assert(newcpu < guest_ncpus);
+
+	rc = vcpu_reset(ctx, newcpu);
+	assert(rc == 0);
+
+	fbsdrun_set_capabilities(ctx, newcpu);
+
+	/*
+	 * Enable the 'unrestricted guest' mode for 'newcpu'.
+	 *
+	 * Set up the processor state in power-on 16-bit mode, with the CS:IP
+	 * init'd to the specified low-mem 4K page.
+	 */
+	rc = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+	assert(rc == 0);
+
+	/* 'rip' is rewritten to the offset within the new code segment. */
+	spinup_ap_realmode(ctx, newcpu, &rip);
+
+#ifdef __FreeBSD__
+	fbsdrun_addcpu(ctx, vcpu, newcpu, rip);
+#else
+	fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false);
+#endif
+
+	return (newcpu);
+}
diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h
new file mode 100644
index 0000000000..226542f6c3
--- /dev/null
+++ b/usr/src/cmd/bhyve/spinup_ap.h
@@ -0,0 +1,36 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPINUP_AP_H_
+#define _SPINUP_AP_H_
+
+/*
+ * Reset and start vcpu 'newcpu' (which must not be 0) on behalf of 'vcpu',
+ * beginning execution at the start of the 4K page that contains 'rip'.
+ * Returns 'newcpu'.
+ */
+int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+
+#endif
diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c
new file mode 100644
index 0000000000..b5950a19d8
--- /dev/null
+++ b/usr/src/cmd/bhyve/task_switch.c
@@ -0,0 +1,941 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+#include <x86/specialreg.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+
+/*
+ * In-memory image of a 32-bit task state segment.  tss32_save() and
+ * tss32_restore() write this structure back to guest memory with
+ * vm_copyout(), so the layout must match the guest's view byte for byte;
+ * the static_assert below pins the size at 104 bytes.
+ *
+ * Using 'struct i386tss' is tempting but causes myriad sign extension
+ * issues because all of its fields are defined as signed integers.
+ */
+struct tss32 {
+ /* Selector written by tss32_restore() for nested task switches. */
+ uint16_t tss_link;
+ uint16_t rsvd1;
+ /* Three esp/ss pairs; each 16-bit selector is padded to 32 bits. */
+ uint32_t tss_esp0;
+ uint16_t tss_ss0;
+ uint16_t rsvd2;
+ uint32_t tss_esp1;
+ uint16_t tss_ss1;
+ uint16_t rsvd3;
+ uint32_t tss_esp2;
+ uint16_t tss_ss2;
+ uint16_t rsvd4;
+ /* Loaded into %cr3 by tss32_restore() when paging is enabled. */
+ uint32_t tss_cr3;
+ uint32_t tss_eip;
+ uint32_t tss_eflags;
+ /* General purpose registers. */
+ uint32_t tss_eax;
+ uint32_t tss_ecx;
+ uint32_t tss_edx;
+ uint32_t tss_ebx;
+ uint32_t tss_esp;
+ uint32_t tss_ebp;
+ uint32_t tss_esi;
+ uint32_t tss_edi;
+ /* Segment selectors, each followed by a 16-bit reserved pad. */
+ uint16_t tss_es;
+ uint16_t rsvd5;
+ uint16_t tss_cs;
+ uint16_t rsvd6;
+ uint16_t tss_ss;
+ uint16_t rsvd7;
+ uint16_t tss_ds;
+ uint16_t rsvd8;
+ uint16_t tss_fs;
+ uint16_t rsvd9;
+ uint16_t tss_gs;
+ uint16_t rsvd10;
+ uint16_t tss_ldt;
+ uint16_t rsvd11;
+ uint16_t tss_trap;
+ uint16_t tss_iomap;
+};
+static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
+
+/*
+ * SEL_START(sel) clears the low three bits of a selector, giving the byte
+ * offset of the first byte of its descriptor within the descriptor table.
+ * SEL_LIMIT(sel) sets the low three bits, giving the offset of the last
+ * byte; it is compared against the table limit.
+ * TSS_BUSY(type) tests bit 1 of a TSS descriptor type.
+ */
+#define SEL_START(sel) (((sel) & ~0x7))
+#define SEL_LIMIT(sel) (((sel) | 0x7))
+#define TSS_BUSY(type) (((type) & 0x2) != 0)
+
+/*
+ * Fetch guest register 'reg' of 'vcpu'.  Failure of the underlying
+ * vm_get_register() call is treated as a programming error.
+ */
+static uint64_t
+GETREG(struct vmctx *ctx, int vcpu, int reg)
+{
+	uint64_t regval;
+	int rc = vm_get_register(ctx, vcpu, reg, &regval);
+
+	assert(rc == 0);
+	return (regval);
+}
+
+/*
+ * Store 'val' into guest register 'reg' of 'vcpu'.  Failure of the
+ * underlying vm_set_register() call is treated as a programming error.
+ */
+static void
+SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+	int rc = vm_set_register(ctx, vcpu, reg, val);
+
+	assert(rc == 0);
+}
+
+/*
+ * Convert a descriptor table entry into the base/limit/access form used by
+ * vm_set_desc().  With the granularity bit set the limit is scaled by 4K
+ * and the low 12 bits are filled with ones.
+ */
+static struct seg_desc
+usd_to_seg_desc(struct user_segment_descriptor *usd)
+{
+	struct seg_desc sd;
+
+	sd.base = (u_int)USD_GETBASE(usd);
+	sd.limit = usd->sd_gran ?
+	    ((u_int)(USD_GETLIMIT(usd) << 12) | 0xfff) :
+	    (u_int)USD_GETLIMIT(usd);
+
+	/* type in bits 4:0, DPL from bit 5, P at 7, then bits 12, 14, 15 */
+	sd.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7 |
+	    usd->sd_xx << 12 | usd->sd_def32 << 14 | usd->sd_gran << 15;
+
+	return (sd);
+}
+
+/*
+ * Inject exception 'vector' with an error code derived from the segment
+ * selector 'sel'.  The error code format is described in section 6.13,
+ * "Error Code", Intel SDM volume 3:
+ *
+ * Bit 0 (EXT) denotes whether the exception occurred during delivery
+ * of an external event like an interrupt; it is taken from 'ext'.
+ *
+ * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
+ * in the IDT.  It is always cleared because none of the selectors
+ * encountered during task switch emulation refer to a task gate in the
+ * IDT.
+ *
+ * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI)
+ * and is retained as-is from the selector, as are the index bits.
+ */
+static void
+sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
+{
+	uint16_t errcode;
+
+	/* Drop the two low selector bits, then fold in EXT. */
+	errcode = sel & ~0x3;
+	if (ext != 0)
+		errcode |= 0x1;
+
+	vm_inject_fault(ctx, vcpu, vector, 1, errcode);
+}
+
+/*
+ * Return 0 if the descriptor referenced by selector 'sel' lies within the
+ * limit of the GDT/LDT and -1 otherwise.  An LDT that is unusable or not
+ * present also fails the check.
+ */
+static int
+desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
+{
+	uint64_t tbl_base;
+	uint32_t tbl_limit, tbl_access;
+	int rc, tbl_reg;
+
+	tbl_reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+	rc = vm_get_desc(ctx, vcpu, tbl_reg, &tbl_base, &tbl_limit,
+	    &tbl_access);
+	assert(rc == 0);
+
+	if (tbl_reg == VM_REG_GUEST_LDTR &&
+	    (SEG_DESC_UNUSABLE(tbl_access) || !SEG_DESC_PRESENT(tbl_access)))
+		return (-1);
+
+	/* The last byte of the descriptor must be inside the table. */
+	return ((tbl_limit < SEL_LIMIT(sel)) ? -1 : 0);
+}
+
+/*
+ * Read/write the segment descriptor 'desc' from/into the GDT/LDT slot
+ * referenced by the selector 'sel'.  The caller must already have verified
+ * that the slot is within the table limit (this is asserted).
+ *
+ * If vm_copy_setup() fails, or reports a fault through '*faultptr', its
+ * return value is passed back and no data is copied; callers therefore
+ * need to examine both the return value and '*faultptr'.  Otherwise the
+ * descriptor is copied and 0 is returned.
+ */
+static int
+desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
+    int *faultptr)
+{
+	struct iovec iov[2];
+	uint64_t tbl_base;
+	uint32_t tbl_limit, tbl_access;
+	int rc, tbl_reg, prot;
+
+	tbl_reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+	rc = vm_get_desc(ctx, vcpu, tbl_reg, &tbl_base, &tbl_limit,
+	    &tbl_access);
+	assert(rc == 0);
+	assert(tbl_limit >= SEL_LIMIT(sel));
+
+	prot = doread ? PROT_READ : PROT_WRITE;
+	rc = vm_copy_setup(ctx, vcpu, paging, tbl_base + SEL_START(sel),
+	    sizeof(*desc), prot, iov, nitems(iov), faultptr);
+	if (rc != 0 || *faultptr != 0)
+		return (rc);
+
+	if (doread) {
+		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
+	} else {
+		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
+	}
+
+	return (0);
+}
+
+/*
+ * Fetch the descriptor referenced by 'sel' into 'desc'; thin wrapper
+ * around desc_table_rw() with the same return convention.
+ */
+static int
+desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
+{
+	const bool doread = true;
+
+	return (desc_table_rw(ctx, vcpu, paging, sel, desc, doread,
+	    faultptr));
+}
+
+/*
+ * Store 'desc' into the descriptor table slot referenced by 'sel'; thin
+ * wrapper around desc_table_rw() with the same return convention.
+ */
+static int
+desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
+{
+	const bool doread = false;
+
+	return (desc_table_rw(ctx, vcpu, paging, sel, desc, doread,
+	    faultptr));
+}
+
+/*
+ * Read the TSS descriptor referenced by 'sel' into 'desc'.  'sel' must be
+ * a non-null GDT selector (asserted).
+ *
+ * Returns 1 if the selector is outside the table limit, in which case an
+ * exception has been injected into the guest (#TS when the switch was
+ * caused by IRET, #GP otherwise).  In all other cases the result of
+ * desc_table_read() is returned and '*faultptr' reports any fault raised
+ * while accessing the table.
+ */
+static int
+read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
+{
+	struct vm_guest_paging sup_paging;
+
+	assert(!ISLDT(sel));
+	assert(IDXSEL(sel) != 0);
+
+	/* Fetch the new TSS descriptor */
+	if (desc_table_limit_check(ctx, vcpu, sel) != 0) {
+		int vec = (ts->reason == TSR_IRET) ? IDT_TS : IDT_GP;
+
+		sel_exception(ctx, vcpu, vec, sel, ts->ext);
+		return (1);
+	}
+
+	/* Descriptor table accesses are implicitly supervisor mode. */
+	sup_paging = ts->paging;
+	sup_paging.cpl = 0;
+
+	return (desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr));
+}
+
+/*
+ * True if 'sd_type' describes a code segment, i.e. bits 4 and 3 of the
+ * descriptor type are both set.
+ */
+static bool
+code_desc(int sd_type)
+{
+	const int code_bits = 0x18;
+
+	return ((sd_type & code_bits) == code_bits);
+}
+
+/*
+ * True if 'sd_type' describes a writable data segment: bit 4 set, bit 3
+ * clear and bit 1 set.
+ */
+static bool
+stack_desc(int sd_type)
+{
+	const int relevant = 0x1A;
+	const int writable_data = 0x12;
+
+	return ((sd_type & relevant) == writable_data);
+}
+
+/*
+ * True if 'sd_type' describes either a data segment (bit 4 set, bit 3
+ * clear) or a readable code segment (bits 4, 3 and 1 all set).
+ */
+static bool
+data_desc(int sd_type)
+{
+	/* data descriptor */
+	if ((sd_type & 0x18) == 0x10)
+		return (true);
+
+	/* readable code descriptor */
+	return ((sd_type & 0x1A) == 0x1A);
+}
+
+/*
+ * True if 'sd_type' is exactly the LDT system descriptor type.
+ */
+static bool
+ldt_desc(int sd_type)
+{
+	const bool is_ldt = (sd_type == SDT_SYSLDT);
+
+	return (is_ldt);
+}
+
+/*
+ * Validate the selector currently loaded in the guest register 'segment'
+ * and, if it is acceptable, return its descriptor in 'seg_desc'.
+ *
+ * 'segment' must be one of VM_REG_GUEST_LDTR, _CS, _SS, _DS, _ES, _FS or
+ * _GS.  The selector is read from the vcpu, so the caller must have loaded
+ * the new value into the register beforehand.
+ *
+ * Returns 0 on success with '*seg_desc' filled in; a NULL selector in a
+ * data segment register or LDTR yields an "unusable" descriptor.
+ * Returns 1 if an exception (#TS, #SS or #NP) was injected into the guest.
+ * Otherwise returns the result of desc_table_read(); in that case
+ * '*faultptr' may be set, so callers need to check both values.
+ */
+static int
+validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ int segment, struct seg_desc *seg_desc, int *faultptr)
+{
+ struct vm_guest_paging sup_paging;
+ struct user_segment_descriptor usd;
+ int error, idtvec;
+ int cpl, dpl, rpl;
+ uint16_t sel, cs;
+ bool ldtseg, codeseg, stackseg, dataseg, conforming;
+
+ /* Classify the register; exactly one of the four flags ends up set. */
+ ldtseg = codeseg = stackseg = dataseg = false;
+ switch (segment) {
+ case VM_REG_GUEST_LDTR:
+ ldtseg = true;
+ break;
+ case VM_REG_GUEST_CS:
+ codeseg = true;
+ break;
+ case VM_REG_GUEST_SS:
+ stackseg = true;
+ break;
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ dataseg = true;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* Get the segment selector */
+ sel = GETREG(ctx, vcpu, segment);
+
+ /* LDT selector must point into the GDT */
+ if (ldtseg && ISLDT(sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Descriptor table limit check */
+ if (desc_table_limit_check(ctx, vcpu, sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* NULL selector */
+ if (IDXSEL(sel) == 0) {
+ /* Code and stack segment selectors cannot be NULL */
+ if (codeseg || stackseg) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ seg_desc->base = 0;
+ seg_desc->limit = 0;
+ seg_desc->access = 0x10000; /* unusable */
+ return (0);
+ }
+
+ /* Read the descriptor from the GDT/LDT */
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
+ if (error || *faultptr)
+ return (error);
+
+ /* Verify that the descriptor type is compatible with the segment */
+ if ((ldtseg && !ldt_desc(usd.sd_type)) ||
+ (codeseg && !code_desc(usd.sd_type)) ||
+ (dataseg && !data_desc(usd.sd_type)) ||
+ (stackseg && !stack_desc(usd.sd_type))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Segment must be marked present */
+ if (!usd.sd_p) {
+ /* The vector depends on which register is being loaded. */
+ if (ldtseg)
+ idtvec = IDT_TS;
+ else if (stackseg)
+ idtvec = IDT_SS;
+ else
+ idtvec = IDT_NP;
+ sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
+ return (1);
+ }
+
+ /*
+ * Privilege checks: CPL is taken from the CS selector currently
+ * loaded in the vcpu, RPL from the selector being validated and DPL
+ * from its descriptor.
+ */
+ cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+ cpl = cs & SEL_RPL_MASK;
+ rpl = sel & SEL_RPL_MASK;
+ dpl = usd.sd_dpl;
+
+ /* For the stack segment RPL, DPL and CPL must all agree. */
+ if (stackseg && (rpl != cpl || dpl != cpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ if (codeseg) {
+ /* Bit 2 of a code descriptor type is the conforming bit. */
+ conforming = (usd.sd_type & 0x4) ? true : false;
+ if ((conforming && (cpl < dpl)) ||
+ (!conforming && (cpl != dpl))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+
+ if (dataseg) {
+ /*
+ * A data segment is always non-conforming except when its
+ * descriptor is a readable, conforming code segment.
+ */
+ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+ conforming = true;
+ else
+ conforming = false;
+
+ if (!conforming && (rpl > dpl || cpl > dpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+ /* All checks passed: hand back the descriptor in vm_set_desc() form. */
+ *seg_desc = usd_to_seg_desc(&usd);
+ return (0);
+}
+
+/*
+ * Capture the outgoing task's register state in 'tss' and write the
+ * updated image back to guest memory through 'iov'.  'eip' is the
+ * instruction pointer to record for the old task.  When the switch was
+ * caused by IRET the NT flag is cleared in the saved eflags.
+ */
+static void
+tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
+    uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+	uint32_t eflags;
+
+	/* General purpose registers */
+	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
+	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
+	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
+	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
+	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
+	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
+	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
+
+	/* Segment selectors */
+	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
+	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
+	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
+	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
+
+	/* eflags (minus NT on IRET) and eip */
+	eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+	if (task_switch->reason == TSR_IRET)
+		eflags &= ~PSL_NT;
+	tss->tss_eflags = eflags;
+	tss->tss_eip = eip;
+
+	/* Copy updated old TSS into guest memory */
+	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
+}
+
+/*
+ * Load the base, limit and access rights from 'sd' into the hidden part
+ * of segment register 'reg'.  Failure of vm_set_desc() is treated as a
+ * programming error.
+ */
+static void
+update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
+{
+	int rc = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
+
+	assert(rc == 0);
+}
+
+/*
+ * Update the vcpu registers to reflect the state of the new task.
+ *
+ * 'tss' is the new task's TSS image and 'iov' describes where it lives in
+ * guest memory; 'ot_sel' is the selector of the outgoing task, recorded in
+ * the new TSS's link field when the switch nests (any reason other than
+ * IRET or JMP).
+ *
+ * The selectors and general purpose registers are loaded first; the
+ * segment descriptors are then validated and loaded one by one.
+ *
+ * Returns 0 on success.
+ * Returns 1 if #GP was injected because a PDPTE had reserved bits set.
+ * Otherwise returns whatever validate_seg_desc() returned, in which case
+ * '*faultptr' may be set.  Register state written before a failure is not
+ * rolled back.
+ */
+static int
+tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
+{
+ struct seg_desc seg_desc, seg_desc2;
+ uint64_t *pdpte, maxphyaddr, reserved;
+ uint32_t eflags;
+ int error, i;
+ bool nested;
+
+ /* Anything other than IRET or JMP links back to the old task. */
+ nested = false;
+ if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+ tss->tss_link = ot_sel;
+ nested = true;
+ }
+
+ eflags = tss->tss_eflags;
+ if (nested)
+ eflags |= PSL_NT;
+
+ /* LDTR */
+ SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+ /* PDBR */
+ if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+ if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+ /*
+ * XXX Assuming 36-bit MAXPHYADDR.
+ */
+ maxphyaddr = (1UL << 36) - 1;
+ /*
+ * NOTE(review): the result of paddr_guest2host() is
+ * dereferenced below without a NULL check - confirm it
+ * cannot fail for a guest-controlled %cr3 value.
+ */
+ pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
+ for (i = 0; i < 4; i++) {
+ /* Check reserved bits if the PDPTE is valid */
+ if (!(pdpte[i] & 0x1))
+ continue;
+ /*
+ * Bits 2:1, 8:5 and bits above the processor's
+ * maximum physical address are reserved.
+ */
+ reserved = ~maxphyaddr | 0x1E6;
+ if (pdpte[i] & reserved) {
+ vm_inject_gp(ctx, vcpu);
+ return (1);
+ }
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
+ /* Later guest memory accesses must use the new address space. */
+ ts->paging.cr3 = tss->tss_cr3;
+ }
+
+ /* eflags and eip */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
+
+ /* General purpose registers */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
+
+ /*
+ * Segment selectors.  These must be loaded before the descriptors
+ * are validated because validate_seg_desc() reads each selector back
+ * from the vcpu.
+ */
+ SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
+ SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
+ SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
+
+ /*
+ * If this is a nested task then write out the new TSS to update
+ * the previous link field.
+ */
+ if (nested)
+ vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
+
+ /* Validate segment descriptors */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
+
+ /*
+ * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
+ *
+ * The SS and CS attribute checks on VM-entry are inter-dependent so
+ * we need to make sure that both segments are valid before updating
+ * either of them. This ensures that the VMCS state can pass the
+ * VM-entry checks so the guest can handle any exception injected
+ * during task switch emulation.
+ */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
+ /* The new CPL is the RPL of the new task's CS selector. */
+ ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
+
+ return (0);
+}
+
+/*
+ * Push an error code on the stack of the new task. This is needed if the
+ * task switch was triggered by a hardware exception that causes an error
+ * code to be saved (e.g. #PF).
+ *
+ * '*faultptr' is set to 1 when #SS or #AC is injected here, and may also be
+ * set by vm_copy_setup().  The return value is 0 unless vm_copy_setup()
+ * returns an error.  %rsp is only updated once the error code has been
+ * written.
+ */
+static int
+push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    int task_type, uint32_t errcode, int *faultptr)
+{
+	struct seg_desc ss_desc;
+	struct iovec iov[2];
+	uint64_t cr0, rflags, gla;
+	uint32_t sp;
+	uint16_t ss_sel;
+	int opsize, spsize, rc;
+
+	*faultptr = 0;
+
+	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+	ss_sel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+
+	rc = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &ss_desc.base,
+	    &ss_desc.limit, &ss_desc.access);
+	assert(rc == 0);
+
+	/*
+	 * Section "Error Code" in the Intel SDM vol 3: the error code is
+	 * pushed on the stack as a doubleword or word (depending on the
+	 * default interrupt, trap or task gate size).
+	 */
+	opsize = (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) ?
+	    4 : 2;
+
+	/*
+	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
+	 * stack-segment descriptor determines the size of the stack
+	 * pointer outside of 64-bit mode.
+	 */
+	spsize = SEG_DESC_DEF32(ss_desc.access) ? 4 : 2;
+
+	sp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+	sp -= opsize;
+
+	/* Segmentation checks on the target stack slot. */
+	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
+	    &ss_desc, sp, opsize, spsize, PROT_WRITE, &gla)) {
+		sel_exception(ctx, vcpu, IDT_SS, ss_sel, 1);
+		*faultptr = 1;
+		return (0);
+	}
+
+	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, gla)) {
+		vm_inject_ac(ctx, vcpu, 1);
+		*faultptr = 1;
+		return (0);
+	}
+
+	rc = vm_copy_setup(ctx, vcpu, paging, gla, opsize, PROT_WRITE,
+	    iov, nitems(iov), faultptr);
+	if (rc != 0 || *faultptr != 0)
+		return (rc);
+
+	vm_copyout(ctx, vcpu, &errcode, iov, opsize);
+	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, sp);
+	return (0);
+}
+
+/*
+ * Evaluate return value from helper functions and potentially return to
+ * the VM run loop.  The macro returns from the *calling* function:
+ *
+ *	error != 0	return VMEXIT_ABORT (only EFAULT is expected)
+ *	fault != 0	return VMEXIT_CONTINUE
+ *	otherwise	fall through to the next statement
+ *
+ * Both arguments are parenthesised wherever they are embedded in a larger
+ * expression so that any expression can be passed safely.  They may be
+ * evaluated more than once, so they must be free of side effects.
+ */
+#define	CHKERR(error,fault)						\
+	do {								\
+		assert(((error) == 0) || ((error) == EFAULT));		\
+		if (error)						\
+			return (VMEXIT_ABORT);				\
+		else if (fault)						\
+			return (VMEXIT_CONTINUE);			\
+	} while (0)
+
+int
+vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ struct seg_desc nt;
+ struct tss32 oldtss, newtss;
+ struct vm_task_switch *task_switch;
+ struct vm_guest_paging *paging, sup_paging;
+ struct user_segment_descriptor nt_desc, ot_desc;
+ struct iovec nt_iov[2], ot_iov[2];
+ uint64_t cr0, ot_base;
+ uint32_t eip, ot_lim, access;
+ int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
+ enum task_switch_reason reason;
+ uint16_t nt_sel, ot_sel;
+
+ task_switch = &vmexit->u.task_switch;
+ nt_sel = task_switch->tsssel;
+ ext = vmexit->u.task_switch.ext;
+ reason = vmexit->u.task_switch.reason;
+ paging = &vmexit->u.task_switch.paging;
+ vcpu = *pvcpu;
+
+ assert(paging->cpu_mode == CPU_MODE_PROTECTED);
+
+ /*
+ * Calculate the instruction pointer to store in the old TSS.
+ */
+ eip = vmexit->rip + vmexit->inst_length;
+
+ /*
+ * Section 4.6, "Access Rights" in Intel SDM Vol 3.
+ * The following page table accesses are implicitly supervisor mode:
+ * - accesses to GDT or LDT to load segment descriptors
+ * - accesses to the task state segment during task switch
+ */
+ sup_paging = *paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+
+ /* Fetch the new TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
+ &fault);
+ CHKERR(error, fault);
+
+ nt = usd_to_seg_desc(&nt_desc);
+
+ /* Verify the type of the new TSS */
+ nt_type = SEG_DESC_TYPE(nt.access);
+ if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
+ nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS descriptor must have present bit set */
+ if (!SEG_DESC_PRESENT(nt.access)) {
+ sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
+ * 44 bytes for a 16-bit TSS.
+ */
+ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
+ minlimit = 104 - 1;
+ else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
+ minlimit = 44 - 1;
+ else
+ minlimit = 0;
+
+ assert(minlimit > 0);
+ if (nt.limit < minlimit) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS must be busy if task switch is due to IRET */
+ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must be available (not busy) if task switch reason is
+ * CALL, JMP, exception or interrupt.
+ */
+ if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
+ goto done;
+ }
+
+ /* Fetch the new TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
+ PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
+ CHKERR(error, fault);
+ vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
+
+ /* Get the old TSS selector from the guest's task register */
+ ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
+ if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
+ /*
+ * This might happen if a task switch was attempted without
+ * ever loading the task register with LTR. In this case the
+ * TR would contain the values from power-on:
+ * (sel = 0, base = 0, limit = 0xffff).
+ */
+ sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
+ goto done;
+ }
+
+ /* Get the old TSS base and limit from the guest's task register */
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
+ &access);
+ assert(error == 0);
+ assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
+ ot_type = SEG_DESC_TYPE(access);
+ assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
+
+ /* Fetch the old TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
+ &fault);
+ CHKERR(error, fault);
+
+ /* Get the old TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
+ PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
+ CHKERR(error, fault);
+ vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
+
+ /*
+ * Clear the busy bit in the old TSS descriptor if the task switch
+ * is due to an IRET or JMP instruction.
+ */
+ if (reason == TSR_IRET || reason == TSR_JMP) {
+ ot_desc.sd_type &= ~0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
+ &ot_desc, &fault);
+ CHKERR(error, fault);
+ }
+
+ if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
+ fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
+ return (VMEXIT_ABORT);
+ }
+
+ /* Save processor state in old TSS */
+ tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
+
+ /*
+ * If the task switch was triggered for any reason other than IRET
+ * then set the busy bit in the new TSS descriptor.
+ */
+ if (reason != TSR_IRET) {
+ nt_desc.sd_type |= 0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
+ &nt_desc, &fault);
+ CHKERR(error, fault);
+ }
+
+ /* Update task register to point at the new TSS */
+ SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
+
+ /* Update the hidden descriptor state of the task register */
+ nt = usd_to_seg_desc(&nt_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
+
+ /* Set CR0.TS */
+ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
+
+ /*
+ * We are now committed to the task switch. Any exceptions encountered
+ * after this point will be handled in the context of the new task and
+ * the saved instruction pointer will belong to the new task.
+ */
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
+ assert(error == 0);
+
+ /* Load processor state from new TSS */
+ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
+ &fault);
+ CHKERR(error, fault);
+
+ /*
+ * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
+ * caused an error code to be generated, this error code is copied
+ * to the stack of the new task.
+ */
+ if (task_switch->errcode_valid) {
+ assert(task_switch->ext);
+ assert(task_switch->reason == TSR_IDT_GATE);
+ error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
+ task_switch->errcode, &fault);
+ CHKERR(error, fault);
+ }
+
+ /*
+ * Treatment of virtual-NMI blocking if NMI is delivered through
+ * a task gate.
+ *
+ * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
+ * If the virtual NMIs VM-execution control is 1, VM entry injects
+ * an NMI, and delivery of the NMI causes a task switch that causes
+ * a VM exit, virtual-NMI blocking is in effect before the VM exit
+ * commences.
+ *
+ * Thus, virtual-NMI blocking is in effect at the time of the task
+ * switch VM exit.
+ */
+
+ /*
+ * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
+ *
+ * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
+ * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
+ * This unblocking of virtual-NMI occurs even if IRET causes a fault.
+ *
+ * Thus, virtual-NMI blocking is cleared at the time of the task switch
+ * VM exit.
+ */
+
+ /*
+ * If the task switch was triggered by an event delivered through
+ * the IDT then extinguish the pending event from the vcpu's
+ * exitintinfo.
+ */
+ if (task_switch->reason == TSR_IDT_GATE) {
+ error = vm_set_intinfo(ctx, vcpu, 0);
+ assert(error == 0);
+ }
+
+ /*
+ * XXX should inject debug exception if 'T' bit is 1
+ */
+done:
+ return (VMEXIT_CONTINUE);
+}
diff --git a/usr/src/cmd/bhyve/test/Makefile b/usr/src/cmd/bhyve/test/Makefile
new file mode 100644
index 0000000000..7dbee0c5f3
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+SUBDIRS = scripts tst
+
+include Makefile.subdirs
diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com
new file mode 100644
index 0000000000..3c719bcea7
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/Makefile.com
@@ -0,0 +1,60 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+include $(SRC)/Makefile.master
+include $(SRC)/cmd/Makefile.cmd
+include $(SRC)/cmd/Makefile.cmd.64
+
+#
+# Force c99 for everything
+#
+CSTD= $(CSTD_GNU99)
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \
+ -_gcc=-Wno-parentheses
+CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \
+ -_gcc=-Wno-parentheses
+CPPFLAGS = -I$(SRC)/cmd/bhyve \
+ -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
+ -I$(CONTRIB)/freebsd/dev/usb/controller \
+ -I$(CONTRIB)/freebsd/dev/mii \
+ $(CPPFLAGS.master) \
+ -I$(ROOT)/usr/platform/i86pc/include \
+ -I$(SRC)/uts/i86pc/io/vmm \
+ -I$(SRC)/uts/common \
+ -I$(SRC)/uts/i86pc \
+ -I$(SRC)/lib/libdladm/common \
+ -DWITHOUT_CAPSICUM
+CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64
+
+CLEANFILES += $(EXETESTS)
+CLOBBERFILES += $(ROOTTESTS)
+
+#
+# Install related definitions
+#
+ROOTOPTPKG = $(ROOT)/opt/bhyvetest
+ROOTBIN = $(ROOTOPTPKG)/bin
+ROOTTST = $(ROOTOPTPKG)/tst
+ROOTTSTDIR = $(ROOTTST)/$(TSTDIR)
+ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%)
+ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%)
+ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%)
+ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT)
+FILEMODE = 0555
+LDLIBS = $(LDLIBS.cmd)
+LINTEXE = $(EXETESTS:%.exe=%.exe.ln)
diff --git a/usr/src/cmd/bhyve/test/Makefile.subdirs b/usr/src/cmd/bhyve/test/Makefile.subdirs
new file mode 100644
index 0000000000..45f0aa67fa
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/Makefile.subdirs
@@ -0,0 +1,29 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+.KEEP_STATE:
+
+all := TARGET += all
+clean := TARGET += clean
+clobber := TARGET += clobber
+install := TARGET += install
+lint := TARGET += lint
+
+all clean clobber install lint: $(SUBDIRS)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ
new file mode 100644
index 0000000000..e3ec55cfdb
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/Makefile.targ
@@ -0,0 +1,55 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+$(ROOTOPTPKG):
+ $(INS.dir)
+
+$(ROOTBIN): $(ROOTOPTPKG)
+ $(INS.dir)
+
+$(ROOTBIN)/%: %.ksh $(ROOTBIN)
+ $(INS.rename)
+
+$(ROOTTST): $(ROOTOPTPKG)
+ $(INS.dir)
+
+$(ROOTTSTDIR): $(ROOTTST)
+ $(INS.dir)
+
+$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR)
+ $(INS.file)
+
+$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR)
+ $(INS.file)
+
+%.exe: %.o $(SUPOBJS)
+ $(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR)
+ $(INS.file)
+
+all: install
+
+%.exe.ln: %.c $(SUPOBJS)
+ $(LINT.c) $< $(LDLIBS)
+
+lint: $(LINTEXE)
+
+clean:
+ -$(RM) *.o $(CLEANFILES)
+
+clobber: clean
+ -$(RM) $(CLOBBERFILES)
diff --git a/usr/src/cmd/bhyve/test/scripts/Makefile b/usr/src/cmd/bhyve/test/scripts/Makefile
new file mode 100644
index 0000000000..d28a5edb8f
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/scripts/Makefile
@@ -0,0 +1,28 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+include ../Makefile.com
+
+SRCS = bhyvetest
+SCRIPTS = $(SRCS:%=$(ROOTBIN)/%)
+
+SCRIPTS := FILEMODE = 0555
+CLOBBERFILES = $(SCRIPTS)
+
+install: $(SCRIPTS)
+
+lint:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh
new file mode 100644
index 0000000000..95b7743417
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh
@@ -0,0 +1,231 @@
+#!/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+#
+# bhyve test suite driver
+#
+unalias -a
+
+bt_arg0=$(basename $0)
+bt_root="$(cd $(dirname $0)/..; pwd -P)"
+bt_ksh="/usr/bin/ksh"
+bt_outdir=
+bt_keep=
+bt_all=
+bt_tnum=0
+bt_tfail=0
+bt_tsuc=0
+
+#
+# Print an optional error message followed by the usage summary, both on
+# stderr, then exit with status 2.
+#
+function usage
+{
+	typeset msg="$*"
+	# Diagnostics belong on stderr, like the usage text below.
+	[[ -z "$msg" ]] || echo "$msg" >&2
+	cat <<USAGE >&2
+Usage: $bt_arg0 [ -o dir ] [ -k ] [ -a | test ... ]
+
+ -o dir Sets 'dir' as the output directory
+ -a Runs all tests, ignores tests passed in
+ -k Keep output from all tests, not just failures
+ -h Print this usage message
+USAGE
+	exit 2
+}
+
+#
+# Report a fatal error on stderr, prefixed with the script name, and exit
+# with status 1.  When no message is given, "failed" is reported.
+#
+function fatal
+{
+	typeset msg="$*"
+
+	if [[ -z "$msg" ]]; then
+		msg="failed"
+	fi
+
+	echo "$bt_arg0: $msg" >&2
+	exit 1
+}
+
+#
+# Create the per-run output directory "<outdir>/<script name>.<pid>" and
+# record its absolute path in bt_outdir.  run_single changes into each test's
+# directory before redirecting output to "$bt_outdir/current", so a relative
+# path (e.g. from "-o out") would otherwise resolve against the wrong
+# directory.
+#
+function setup_outdir
+{
+	bt_outdir="$bt_outdir/$bt_arg0.$$"
+	mkdir -p "$bt_outdir" || fatal "failed to make output dir $bt_outdir"
+	bt_outdir=$(cd "$bt_outdir" && pwd -P) || \
+	    fatal "failed to resolve output dir"
+}
+
+#
+# Run a single test and record the outcome.
+#
+# The last dot-separated component of the test's file name selects how it is
+# run: "ksh" via $bt_ksh, "exe" directly.  "out" files (expected output) and
+# unknown extensions are skipped.  The first component states the expected
+# result: "err" or "ecreate" means the test must exit non-zero, anything else
+# means it must exit zero.  If "$name.out" exists, the test's stdout must
+# match it.
+#
+# Output is collected in $bt_outdir/current, which is then renamed to
+# failure.<n> or success.<n>; bt_tfail, bt_tsuc and bt_tnum are updated.
+#
+function run_single
+{
+	typeset name=$1
+	typeset expect base ext exe command odir res reason
+	typeset iserr
+
+	[[ -z "$name" ]] && fatal "missing test to run"
+	base=${name##*/}
+	ext=${base##*.}
+	expect=${base%%.*}
+	odir="$bt_outdir/current"
+	[[ -z "$ext" ]] && fatal "found test without ext: $name"
+	[[ -z "$expect" ]] && fatal "found test without prefix: $name"
+
+	if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then
+		iserr="yup"
+	else
+		iserr=""
+	fi
+
+	case "$ext" in
+	"ksh")
+		command="$bt_ksh ./$base"
+		;;
+	"exe")
+		command="./$base"
+		;;
+	"out")
+		#
+		# This is the file format for checking output against.
+		#
+		return 0
+		;;
+	*)
+		echo "skipping test $name (unknown extension)"
+		return 0
+		;;
+	esac
+
+	echo "Executing test $name ... \c"
+	mkdir -p "$odir" >/dev/null || fatal "can't make output directory"
+	# Tests are run from their own directory.
+	cd $(dirname $name) || fatal "failed to enter test directory"
+	$command > "$odir/stdout" 2>"$odir/stderr"
+	res=$?
+	cd - > /dev/null || fatal "failed to leave test directory"
+
+	# An stdout mismatch takes precedence over the exit status checks.
+	if [[ -f "$name.out" ]] && \
+	    ! diff "$name.out" "$odir/stdout" >/dev/null; then
+		cp $name.out $odir/$base.out
+		reason="stdout mismatch"
+	elif [[ -n "$iserr" && $res -eq 0 ]]; then
+		reason="test exited $res, not non-zero"
+	elif [[ -z "$iserr" && $res -ne 0 ]]; then
+		reason="test exited $res, not zero"
+	fi
+
+	if [[ -n "$reason" ]]; then
+		echo "$reason"
+		((bt_tfail++))
+		mv "$odir" "$bt_outdir/failure.$bt_tfail" || fatal \
+		    "failed to move test output directory"
+		cp "$name" "$bt_outdir/failure.$bt_tfail/$(basename $name)" || \
+		    fatal "failed to copy test into output directory"
+	else
+		echo "passed"
+		((bt_tsuc++))
+		mv "$odir" "$bt_outdir/success.$bt_tsuc" || fatal \
+		    "failed to move test directory"
+	fi
+
+	((bt_tnum++))
+}
+
+function run_all
+{
+ typeset tests t dir
+
+ tests=$(ls -1 $bt_root/tst/*/*.@(ksh|exe))
+ for t in $tests; do
+ run_single $t
+ done
+}
+
+function welcome
+{
+ cat <<WELCOME
+Starting tests...
+output directory: $bt_outdir
+WELCOME
+}
+
+function cleanup
+{
+ [[ -n "$bt_keep" ]] && return
+ rm -rf "$bt_outdir"/success.* || fatal \
+ "failed to remove successful test cases"
+ if [[ $bt_tfail -eq 0 ]]; then
+ rmdir "$bt_outdir" || fatal \
+ "failed to remove test output directory"
+ fi
+}
+
+function goodbye
+{
+ cat <<EOF
+
+-------------
+Results
+-------------
+
+Tests passed: $bt_tsuc
+Tests failed: $bt_tfail
+Tests ran: $bt_tnum
+
+EOF
+ if [[ $bt_tfail -eq 0 ]]; then
+ echo "Congrats, some tiny parts of bhyve aren't completely" \
+ "broken, the tests pass".
+ else
+ echo "Some tests failed, you have some work to do."
+ fi
+}
+
+#
+# Parse command line options.  The leading ':' in the option string requests
+# silent error reporting: a missing option argument is delivered as ':' and
+# an unknown option falls through to the default arm, with the option letter
+# in OPTARG in both cases.  Only options that have a handler below are listed
+# in the option string.
+#
+while getopts ":ahko:" c $@; do
+	case "$c" in
+	a)
+		bt_all="y"
+		;;
+	k)
+		bt_keep="y"
+		;;
+	o)
+		bt_outdir="$OPTARG"
+		;;
+	h)
+		usage
+		;;
+	:)
+		usage "option requires an argument -- $OPTARG"
+		;;
+	*)
+		usage "invalid option -- $OPTARG"
+		;;
+	esac
+done
+
+# Drop the parsed options; what remains is the list of tests to run.
+shift $((OPTIND-1))
+
+[[ -z "$bt_all" && $# == 0 ]] && usage "no tests to run"
+
+# Without -o, output directories are created under the current directory.
+[[ -z "$bt_outdir" ]] && bt_outdir="$PWD"
+
+setup_outdir
+welcome
+
+if [[ ! -z "$bt_all" ]]; then
+	run_all
+else
+	for t in $@; do
+		[[ -f $t ]] || fatal "cannot find test $t"
+		run_single $t
+	done
+fi
+
+goodbye
+cleanup
+
+#
+# The script's exit status is that of this last command: 0 if no test
+# failed, 1 otherwise.
+#
+[[ $bt_tfail -eq 0 ]]
diff --git a/usr/src/cmd/bhyve/test/tst/Makefile b/usr/src/cmd/bhyve/test/tst/Makefile
new file mode 100644
index 0000000000..f6a6ec96fc
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+SUBDIRS = mevent
+
+include ../Makefile.subdirs
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/Makefile b/usr/src/cmd/bhyve/test/tst/mevent/Makefile
new file mode 100644
index 0000000000..047886bc6a
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/Makefile
@@ -0,0 +1,30 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+TSTDIR = mevent
+EXETESTS = \
+ lists.delete.exe \
+ read.disable.exe \
+ read.pause.exe \
+ read.requeue.exe \
+
+SHTESTS =
+SUPOBJS = mevent.o testlib.o
+
+include ../../Makefile.com
+
+install: $(ROOTTESTS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c
new file mode 100644
index 0000000000..d09ac133a3
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c
@@ -0,0 +1,172 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: lists.delete
+ * Assertion: mevent_delete() causes the total number of events to decrease
+ *
+ * Strategy: 1. Create a pipe.
+ * 2. Call mevent_add() to be notified of writes to the pipe. The
+ * callback will do nothing other than generate an error if it
+ * is called.
+ * 3. Create another pipe and add a read event watcher to it. The
+ * callback will signal a cv when called. A write to the pipe
+ * followed by a wait on the cv will ensure that async
+ * operations in mevent.c are complete. See flush_and_wait().
+ * 4. Call flush_and_wait(), then get event count.
+ * 5. Delete the event created in step 2.
+ * 6. Call flush_and_wait(), then get event count.
+ * 7. Verify result in step 6 is one less than result in step 4.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+/*
+ * Return the sum of the three counts reported by test_mevent_count_lists():
+ * events on the global list, events on the change list, and events pending
+ * delete.  Asserts that every count was filled in and that no delete is
+ * still pending.
+ */
+static int
+get_count(void)
+{
+	/* -1 is a sentinel used to verify that each count really gets set. */
+	int global = -1, change = -1, del_pending = -1;
+	int total;
+
+	test_mevent_count_lists(&global, &change, &del_pending);
+	ASSERT_INT_NEQ(("count not set"), global, -1);
+	ASSERT_INT_NEQ(("count not set"), change, -1);
+	ASSERT_INT_NEQ(("count not set"), del_pending, -1);
+	ASSERT_INT_EQ(("pending delete not processed"), del_pending, 0);
+
+	total = global + change + del_pending;
+
+	VERBOSE(("count = %d (%d + %d + %d)", total, global, change,
+	    del_pending));
+
+	return (total);
+}
+
+static void
+not_called_cb(int fd, enum ev_type ev, void *arg)
+{
+ FAIL(("this callback should never be called"));
+}
+
+/*
+ * Predicate for cv: cleared by flush_and_wait() and set by flush_cb(), both
+ * while holding mtx, so that a spurious wakeup is not mistaken for a
+ * completed flush.
+ */
+static int flushed;
+
+/*
+ * Read event callback for the flush pipe: drain the pipe, then tell
+ * flush_and_wait() that the read has been handled.
+ */
+static void
+flush_cb(int fd, enum ev_type ev, void *arg)
+{
+	char buf[32];
+
+	/* Drain the pipe */
+	while (read(fd, buf, sizeof (buf)) > 0)
+		;
+
+	pthread_mutex_lock(&mtx);
+	flushed = 1;
+	pthread_cond_signal(&cv);
+	pthread_mutex_unlock(&mtx);
+}
+
+/*
+ * Write one byte to fd and block until flush_cb() reports that it has run.
+ */
+void
+flush_and_wait(int fd)
+{
+	uint8_t msg = 42;
+
+	/*
+	 * Lock taken ahead of waking flush_cb so this thread doesn't race
+	 * with the event thread.
+	 */
+	pthread_mutex_lock(&mtx);
+	flushed = 0;
+	if (write(fd, &msg, sizeof (msg)) != sizeof (msg)) {
+		FAIL(("bad write"));
+	}
+
+	/*
+	 * Wait for it to be read.  pthread_cond_wait() may return without a
+	 * signal, so keep waiting until flush_cb has set the flag.
+	 */
+	while (!flushed)
+		(void) pthread_cond_wait(&cv, &mtx);
+	pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+ int unused_pipe[2];
+ int flush_pipe[2];
+ struct mevent *unused_evp, *flush_evp;
+ int count1, count2;
+
+ start_test(argv[0], 5);
+ start_event_thread();
+
+ /*
+ * Create first pipe and related event
+ */
+ if (pipe(unused_pipe) != 0) {
+ FAIL_ERRNO("pipe");
+ }
+ VERBOSE(("unused_pipe[] = { %d, %d }", unused_pipe[0], unused_pipe[1]));
+ if (fcntl(unused_pipe[0], F_SETFL, O_NONBLOCK) != 0) {
+ FAIL_ERRNO("set pipe nonblocking");
+ }
+ unused_evp = mevent_add(unused_pipe[0], EVF_READ, not_called_cb, NULL);
+ ASSERT_PTR_NEQ(("mevent_add"), unused_evp, NULL);
+
+ /*
+ * Create flush pipe and related event
+ */
+ if (pipe(flush_pipe) != 0) {
+ FAIL_ERRNO("pipe");
+ }
+ VERBOSE(("flush_pipe[] = { %d, %d }", flush_pipe[0],
+ flush_pipe[1]));
+ if (fcntl(flush_pipe[0], F_SETFL, O_NONBLOCK) != 0) {
+ FAIL_ERRNO("set pipe nonblocking");
+ }
+ flush_evp = mevent_add(flush_pipe[0], EVF_READ, flush_cb, NULL);
+ ASSERT_PTR_NEQ(("mevent_add"), flush_evp, NULL);
+
+ /* Get count before delete. */
+ flush_and_wait(flush_pipe[1]);
+ count1 = get_count();
+
+ /*
+ * Delete the first event and flush a read after the delete is
+ * complete.
+ */
+ if (mevent_delete(unused_evp) != 0) {
+ FAIL_ERRNO("mevent_delete");
+ }
+
+ /*
+ * Verify count decreased.
+ */
+ flush_and_wait(flush_pipe[1]);
+ count2 = get_count();
+ if (count1 - 1 != count2 ) {
+ FAIL(("mevent_delete() did not decrease count by 1: "
+ "was %d, now %d", count1, count2));
+ }
+
+ PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/mevent.c b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c
new file mode 100644
index 0000000000..17b6546847
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c
@@ -0,0 +1,57 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include "../../../mevent.c"
+#include "testlib.h"
+
+/*
+ * Returns by reference the number of events on the global and change lists,
+ * and how many of the events on the change list are pending delete.
+ *
+ * Used by tests that wish to ensure that the event count changes as suggested
+ * by mevent_add() and mevent_delete(). Note that a delete does not immediately
+ * delete an event. Events that are pending delete are included in the change
+ * list until the next pass through the change list to process pending changes.
+ */
+void
+test_mevent_count_lists(int *ret_global, int *ret_change, int *ret_del_pending)
+{
+ struct mevent *mevp;
+ int global = 0;
+ int change = 0;
+ int del_pending = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH(mevp, &global_head, me_list) {
+ global++;
+ VERBOSE(("on global: type %d fd %d state %d", mevp->me_type,
+ mevp->me_fd, mevp->me_state));
+ }
+
+ LIST_FOREACH(mevp, &change_head, me_list) {
+ change++;
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ del_pending++;
+ }
+ VERBOSE(("on change: type %d fd %d state %d", mevp->me_type,
+ mevp->me_fd, mevp->me_state));
+ }
+
+ mevent_qunlock();
+
+ *ret_global = global;
+ *ret_change = change;
+ *ret_del_pending = del_pending;
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
new file mode 100644
index 0000000000..d23b1af96c
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
@@ -0,0 +1,163 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.disable
+ * Assertion: A read is not requeued if mevent_disable() is called while it is
+ * being handled.
+ *
+ * Strategy: 1. Create a pipe
+ * 2. Call mevent_add() to be notified of writes to the pipe. The
+ * callback will signal a cv.
+ * 3. Write to the pipe then wait for a wakeup.
+ * 4. From the read event callback, disable the event then awaken
+ * the main thread.
+ * 5. In the main thread, add a timer event that will awaken the
+ * main thread after a short delay.
+ * 6. Write to the pipe and wait to be awoken. The wakeup should
+ * come from the timer event, not the read event.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+typedef enum {
+ CB_NONE,
+ CB_READ,
+ CB_TIMER,
+} lastwake_t;
+
+static lastwake_t lastwake = CB_NONE;
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static struct mevent *read_event;
+
+/*
+ * Read event callback for the pipe.  Reads what is available, calls
+ * mevent_disable() on the read event from within its own callback, then
+ * records CB_READ in lastwake and signals the cv.
+ */
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+	ssize_t nbytes;
+	char buf[32] = { 0 };
+	int err;
+
+	if ((nbytes = read(fd, buf, sizeof (buf))) < 0) {
+		FAIL_ERRNO("bad read");
+	}
+	VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+	err = mevent_disable(read_event);
+	/*
+	 * As in the other ASSERT_* messages in these tests, the message is a
+	 * format string followed by its arguments, so strerror() needs a %s.
+	 */
+	ASSERT_INT_EQ(("mevent_disable: %s", strerror(err)), err, 0);
+
+	pthread_mutex_lock(&mtx);
+
+	/* This callback must be the first, and only the first, wakeup. */
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_NONE);
+	lastwake = CB_READ;
+
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+
+	pthread_mutex_unlock(&mtx);
+}
+
+static void
+tick(int ms, enum ev_type ev, void *arg)
+{
+ pthread_mutex_lock(&mtx);
+
+ ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+ lastwake = CB_TIMER;
+
+ pthread_cond_signal(&cv);
+ VERBOSE(("wakeup"));
+
+ pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+ int pipefds[2];
+ struct mevent *timer;
+ ssize_t written;
+ char *msgs[] = { "first", "second" };
+ char *msg;
+
+ start_test(argv[0], 5);
+ start_event_thread();
+
+ if (pipe(pipefds) != 0) {
+ FAIL_ERRNO("pipe");
+ }
+ if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+ FAIL_ERRNO("set pipe nonblocking");
+ }
+
+ /*
+ * First write
+ */
+ msg = msgs[0];
+ read_event = mevent_add(pipefds[0], EVF_READ, munch, msg);
+ ASSERT_PTR_NEQ(("mevent_add pipefd"), read_event, NULL);
+
+ pthread_mutex_lock(&mtx);
+ written = write(pipefds[1], msg, strlen(msg));
+ if (written < 0) {
+ FAIL_ERRNO("bad write");
+ }
+ ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+ /*
+ * Wait for it to be read
+ */
+ pthread_cond_wait(&cv, &mtx);
+ ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+ pthread_mutex_unlock(&mtx);
+
+ /*
+ * Add timer, second write.
+ */
+ msg = msgs[1];
+ timer = mevent_add(50, EVF_TIMER, tick, msg);
+ ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL);
+
+ pthread_mutex_lock(&mtx);
+ written = write(pipefds[1], msg, strlen(msg));
+ if (written < 0) {
+ FAIL_ERRNO("bad write");
+ }
+ ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+ /*
+ * Wait for timer to expire
+ */
+ pthread_cond_wait(&cv, &mtx);
+ ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_TIMER);
+ pthread_mutex_unlock(&mtx);
+
+ PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
new file mode 100644
index 0000000000..c877f014f6
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
@@ -0,0 +1,152 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.pause
+ * Assertion: mevent_disable() can be used to pause reads.
+ *
+ * Strategy: 1. Create a pipe
+ * 2. Call mevent_add() to be notified of writes to the pipe. The
+ * callback will signal a cv.
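+ * 3. In a loop, write to the pipe a byte at a time, waiting on the
+ * cv after each write.
+ * 4. Once byte 'pauseat' has been read, add a timer event whose
+ * callback signals the cv and call mevent_disable() on the read
+ * event, so that later writes accumulate in the pipe.
+ * 5. After the last write, call mevent_enable(); the callback
+ * expects the accumulated bytes to arrive in a single read.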
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static char cookie[] = "Chocolate chip with fudge stripes";
+
+/*
+ * After this many bytes are sent, writes will get batched up, progress will be
+ * made on the write side via an interval timer
+ */
+const int pauseat = 8;
+
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+ static int i = 0;
+ char buf[sizeof (cookie)] = { 0 };
+ ssize_t nbytes;
+ ssize_t expected;
+
+ ASSERT_INT_EQ(("bad event"), ev, EVF_READ);
+ ASSERT_PTR_EQ(("bad cookie"), arg, cookie);
+
+ /*
+ * For the first while, expect data to come a byte at a time. After the
+ * pause, we should get a burst with the rest of the data.
+ */
+ if (i > pauseat) {
+ expected = strlen(cookie) - pauseat - 1;
+ } else {
+ expected = 1;
+ }
+
+ if ((nbytes = read(fd, buf, sizeof (buf))) < 0) {
+ FAIL_ERRNO("bad read");
+ }
+ VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+ ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, expected);
+
+ if (expected == 1) {
+ ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]);
+ } else {
+ ASSERT_STR_EQ(("bad last half of cookie"), buf, &cookie[i]);
+ }
+
+ pthread_mutex_lock(&mtx);
+ pthread_cond_signal(&cv);
+ VERBOSE(("wakeup"));
+ pthread_mutex_unlock(&mtx);
+
+ i++;
+}
+
+static void
+tick(int ms, enum ev_type ev, void *arg)
+{
+ pthread_mutex_lock(&mtx);
+ pthread_cond_signal(&cv);
+ VERBOSE(("wakeup"));
+ pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+ int pipefds[2];
+ struct mevent *evp, *timer;
+ ssize_t written;
+
+ start_test(argv[0], 5);
+ start_event_thread();
+
+ if (pipe(pipefds) != 0) {
+ FAIL_ERRNO("pipe");
+ }
+ if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+ FAIL_ERRNO("set pipe nonblocking");
+ }
+
+ evp = mevent_add(pipefds[0], EVF_READ, munch, cookie);
+ ASSERT_PTR_NEQ(("mevent_add pipefd"), evp, NULL);
+
+ for (int i = 0; cookie[i] != 0; i++) {
+ pthread_mutex_lock(&mtx);
+ written = write(pipefds[1], cookie + i, 1);
+ if (written < 0) {
+ FAIL_ERRNO("bad write");
+ }
+ ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1);
+
+ /* Wait for it to be read */
+ pthread_cond_wait(&cv, &mtx);
+ pthread_mutex_unlock(&mtx);
+
+ if (i == pauseat) {
+ timer = mevent_add(10, EVF_TIMER, tick,
+ &cookie[pauseat]);
+ ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL);
+ VERBOSE(("disable munch"));
+ mevent_disable(evp);
+ }
+ }
+
+ pthread_mutex_lock(&mtx);
+
+ mevent_enable(evp);
+
+ pthread_cond_wait(&cv, &mtx);
+ pthread_mutex_unlock(&mtx);
+
+ PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c
new file mode 100644
index 0000000000..ddc3e27235
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c
@@ -0,0 +1,108 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.requeue
+ * Assertion: A sequence of writes turns into a sequence of events.
+ *
+ * Strategy: 1. Create a pipe
+ * 2. Call mevent_add() to be notified of writes to the pipe. The
+ * callback will signal a cv.
+ * 3. In a loop, write to the pipe then wait on the cv.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static char *cookie = "Chocolate chip with fudge stripes";
+
+/*
+ * Read-event callback for the pipe.  Every invocation must find exactly one
+ * byte to read, and that byte must be the next character of the cookie.
+ * The main thread is then woken through cv.
+ */
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+	static int i;		/* index of the next expected cookie byte */
+	char buf[8] = { 0 };
+	ssize_t nbytes;
+
+	ASSERT_INT_EQ(("bad event"), ev, EVF_READ);
+	ASSERT_PTR_EQ(("bad cookie"), arg, cookie);
+
+	nbytes = read(fd, buf, sizeof (buf));
+	if (nbytes < 0) {
+		/* Always fails here; used to report errno with the count. */
+		ASSERT_INT64_EQ(("bad read: %s", strerror(errno)), nbytes, 1);
+	}
+	VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+	ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, 1);
+
+	ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]);
+
+	(void) pthread_mutex_lock(&mtx);
+	(void) pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+	(void) pthread_mutex_unlock(&mtx);
+
+	i++;
+}
+
+/*
+ * Register munch() for reads on a non-blocking pipe, then write the cookie
+ * one byte at a time, waiting on cv after each write until munch() reports
+ * that the byte was consumed.  start_test() arms a 5 second timeout.
+ */
+int
+main(int argc, const char *argv[])
+{
+	struct mevent *evp;
+	ssize_t written;
+	int pipefds[2];
+
+	start_test(argv[0], 5);
+	start_event_thread();
+
+	if (pipe(pipefds) != 0) {
+		FAIL_ERRNO("pipe");
+	}
+	if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+		FAIL_ERRNO("set pipe nonblocking");
+	}
+
+	evp = mevent_add(pipefds[0], EVF_READ, munch, cookie);
+	ASSERT_PTR_NEQ(("mevent_add"), evp, NULL);
+
+	for (int i = 0; cookie[i] != '\0'; i++) {
+		/*
+		 * Hold the mutex across the write so munch() cannot signal
+		 * before this thread is waiting.
+		 */
+		(void) pthread_mutex_lock(&mtx);
+		if ((written = write(pipefds[1], &cookie[i], 1)) < 0) {
+			FAIL_ERRNO("bad write");
+		}
+		ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1);
+
+		/* Wait for it to be read */
+		(void) pthread_cond_wait(&cv, &mtx);
+		(void) pthread_mutex_unlock(&mtx);
+	}
+
+	PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.c b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c
new file mode 100644
index 0000000000..af756d1509
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <pthread.h>
+#include <signal.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+const char *testlib_prog;
+boolean_t testlib_verbose;
+
+/*
+ * SIGALRM handler installed by start_test(): the test exceeded its time
+ * limit, so report the failure and exit.
+ */
+static void
+timed_out(int signo)
+{
+	ASSERT_INT_EQ(("timeout signal"), signo, SIGALRM);
+
+	FAIL(("Timed out"));
+}
+
+/*
+ * Common test setup.
+ *
+ * argv0:   program path; its final component becomes testlib_prog.
+ * timeout: seconds until SIGALRM fails the test via timed_out().
+ *
+ * Verbose output is enabled when TEST_VERBOSE is set to a non-empty value.
+ */
+void
+start_test(const char *argv0, uint32_t timeout)
+{
+	const char *slash = strrchr(argv0, '/');
+	const char *val = getenv("TEST_VERBOSE");
+
+	testlib_prog = (slash != NULL) ? slash + 1 : argv0;
+	testlib_verbose = (val != NULL) && val[0] != '\0';
+
+	signal(SIGALRM, timed_out);
+	alarm(timeout);
+}
+
+/*
+ * Thread body used by start_event_thread(): runs mevent_dispatch() and
+ * returns NULL if it ever comes back.
+ */
+/* ARGSUSED */
+static void *
+event_thread(void *arg)
+{
+	mevent_dispatch();
+
+	return (NULL);
+}
+
+/*
+ * Start a thread running event_thread().  Fails the test if the thread
+ * cannot be created.
+ */
+void
+start_event_thread(void)
+{
+	pthread_t tid;
+	int err;
+
+	err = pthread_create(&tid, NULL, event_thread, NULL);
+	if (err != 0) {
+		/*
+		 * pthread_create() reports failure through its return value
+		 * and does not set errno, so hand the code to FAIL_ERRNO.
+		 */
+		errno = err;
+		FAIL_ERRNO("pthread_create");
+	}
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.h b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h
new file mode 100644
index 0000000000..80949f3cc7
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h
@@ -0,0 +1,88 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "mevent.h"
+
+#define EXIT_PASS 0
+#define EXIT_FAIL 1
+
+#define VERBOSE(msg) \
+ if (testlib_verbose) { \
+ (void) printf("VERBOSE %s: %s:%d %s: ", testlib_prog, \
+ __FILE__, __LINE__, __func__); \
+ (void) printf msg; \
+ (void) printf("\n"); \
+ }
+
+#define FAIL_PROLOGUE() \
+ (void) printf("FAIL %s: %s:%d: ", testlib_prog, __FILE__, __LINE__)
+
+#define FAIL(msg) \
+ { \
+ FAIL_PROLOGUE(); \
+ (void) printf msg; \
+ (void) printf("\n"); \
+ exit(EXIT_FAIL); \
+ }
+
+#define FAIL_ERRNO(msg) FAIL((msg ": %s", strerror(errno)))
+
+#define PASS() \
+ { \
+ (void) printf("PASS %s\n", testlib_prog); \
+ exit(EXIT_PASS); \
+ }
+
+#define ASSERT_CMP(msg, got, cmp, exp, nfmt) \
+ if (!(got cmp exp)) { \
+ FAIL_PROLOGUE(); \
+ (void) printf msg; \
+ (void) printf(": %s=" nfmt " %s %s=" nfmt "\n", \
+ #got, got, #cmp, #exp, exp); \
+ exit(EXIT_FAIL); \
+ }
+
+#define ASSERT_CHAR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%c")
+#define ASSERT_INT_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%d")
+#define ASSERT_INT_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%d")
+#define ASSERT_INT64_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%ld")
+#define ASSERT_PTR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%p")
+#define ASSERT_PTR_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%p")
+
+#define ASSERT_STR_EQ(msg, got, exp) \
+ if (strcmp(got, exp) != 0) { \
+ FAIL_PROLOGUE(); \
+ (void) printf msg; \
+ (void) printf(": %s='%s' != %s='%s'\n", \
+ #got, got, #exp, exp); \
+ exit(EXIT_FAIL); \
+ }
+
+extern const char *testlib_prog;
+extern boolean_t testlib_verbose;
+
+extern void start_test(const char *, uint32_t);
+extern void start_event_thread(void);
+extern void test_mevent_count_lists(int *, int *, int *);
diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c
new file mode 100644
index 0000000000..1027d0b0f6
--- /dev/null
+++ b/usr/src/cmd/bhyve/uart_emul.c
@@ -0,0 +1,955 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <dev/ic/ns16550.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#include <capsicum_helpers.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <pthread.h>
+#include <sysexits.h>
+#ifndef __FreeBSD__
+#include <sys/socket.h>
+#endif
+
+#include "mevent.h"
+#include "uart_emul.h"
+
+#define COM1_BASE 0x3F8 /* base address of the first uart_lres entry */
+#define COM1_IRQ 4 /* irq of the first uart_lres entry */
+#define COM2_BASE 0x2F8 /* base address of the second uart_lres entry */
+#define COM2_IRQ 3 /* irq of the second uart_lres entry */
+
+#define DEFAULT_RCLK 1843200 /* clock from which uart_reset() derives the divisor */
+#define DEFAULT_BAUD 9600 /* baud rate assumed by uart_reset() */
+
+#define FCR_RX_MASK 0xC0 /* FCR bits 6-7, retained when the FCR is written */
+
+#define MCR_OUT1 0x04 /* reflected into MSR_RI in loopback mode */
+#define MCR_OUT2 0x08 /* reflected into MSR_DCD in loopback mode */
+
+#define MSR_DELTA_MASK 0x0f /* MSR delta bits; cleared when the MSR is read */
+
+#ifndef REG_SCR
+#define REG_SCR com_scr /* fallback when REG_SCR is not already defined */
+#endif
+
+#define FIFOSZ 16 /* receive FIFO depth while FCR_ENABLE is set */
+
+static bool uart_stdio; /* stdio in use for i/o */
+static struct termios tio_stdio_orig; /* stdin settings saved by ttyopen(), restored by ttyclose() */
+
+static struct { /* legacy port resources handed out by uart_legacy_alloc() */
+ int baseaddr;
+ int irq;
+ bool inuse; /* set once the entry has been allocated */
+} uart_lres[] = {
+ { COM1_BASE, COM1_IRQ, false},
+ { COM2_BASE, COM2_IRQ, false},
+};
+
+#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) /* number of uart_lres entries */
+
+struct fifo { /* receive ring buffer; indices wrap modulo `size' */
+ uint8_t buf[FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of characters in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct ttyfd {
+ bool opened; /* set when a stdio or tty backend has been selected */
+ int fd; /* tty device file descriptor */
+ struct termios tio_orig, tio_new; /* settings read by ttyopen() and the raw-mode copy it applies */
+};
+
+struct uart_softc {
+ pthread_mutex_t mtx; /* protects all softc elements */
+ uint8_t data; /* Data register (R/W) */
+ uint8_t ier; /* Interrupt enable register (R/W) */
+ uint8_t lcr; /* Line control register (R/W) */
+ uint8_t mcr; /* Modem control register (R/W) */
+ uint8_t lsr; /* Line status register (R/W) */
+ uint8_t msr; /* Modem status register (R/W) */
+ uint8_t fcr; /* FIFO control register (W) */
+ uint8_t scr; /* Scratch register (R/W) */
+
+ uint8_t dll; /* Baudrate divisor latch LSB */
+ uint8_t dlh; /* Baudrate divisor latch MSB */
+
+ struct fifo rxfifo; /* characters received but not yet read by the guest */
+ struct mevent *mev; /* read event for the tty fd or the connected socket client */
+
+ struct ttyfd tty; /* stdio/tty backend state */
+#ifndef __FreeBSD__
+ bool sock; /* true when the backend is a unix domain socket */
+ struct {
+ int clifd; /* console client unix domain socket */
+ int servfd; /* console server unix domain socket */
+ struct mevent *servmev; /* mevent for server socket */
+ } usc_sock;
+#endif
+
+ bool thre_int_pending; /* THRE interrupt pending */
+
+ void *arg; /* passed to intr_assert and intr_deassert */
+ uart_intr_func_t intr_assert; /* called when an interrupt reason is pending */
+ uart_intr_func_t intr_deassert; /* called when no interrupt reason is pending */
+};
+
+static void uart_drain(int fd, enum ev_type ev, void *arg);
+
+/*
+ * atexit() handler registered by ttyopen(): put stdin back to the terminal
+ * settings that were saved in tio_stdio_orig.
+ */
+static void
+ttyclose(void)
+{
+	(void) tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig);
+}
+
+/*
+ * Switch the backend terminal to raw mode with CLOCAL set, keeping the
+ * previous settings in tf->tio_orig.  When the backend is stdin, the
+ * previous settings are also saved globally and restored at exit.
+ */
+static void
+ttyopen(struct ttyfd *tf)
+{
+	struct termios raw;
+
+	tcgetattr(tf->fd, &tf->tio_orig);
+
+	raw = tf->tio_orig;
+	cfmakeraw(&raw);
+	raw.c_cflag |= CLOCAL;
+	tf->tio_new = raw;
+	tcsetattr(tf->fd, TCSANOW, &tf->tio_new);
+
+	if (tf->fd != STDIN_FILENO)
+		return;
+
+	tio_stdio_orig = tf->tio_orig;
+	atexit(ttyclose);
+}
+
+/*
+ * Read one byte from the backend terminal.  Returns the byte (0-255), or
+ * -1 if exactly one byte could not be read.
+ */
+static int
+ttyread(struct ttyfd *tf)
+{
+	unsigned char rb;
+
+	if (read(tf->fd, &rb, 1) != 1)
+		return (-1);
+
+	return (rb);
+}
+
+/*
+ * Write one byte to the backend terminal.  The result of write() is
+ * deliberately ignored.
+ */
+static void
+ttywrite(struct ttyfd *tf, unsigned char wb)
+{
+	const unsigned char out = wb;
+
+	(void) write(tf->fd, &out, 1);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Write one byte to the connected socket client.  The result of write() is
+ * deliberately ignored.
+ */
+static void
+sockwrite(struct uart_softc *sc, unsigned char wb)
+{
+	const unsigned char out = wb;
+
+	(void) write(sc->usc_sock.clifd, &out, 1);
+}
+#endif
+
+/*
+ * Empty the receive FIFO and set its depth to `size'.  Input already
+ * queued on an open tty or a connected socket client is read and discarded,
+ * and read events for that backend are (re-)enabled.
+ */
+static void
+rxfifo_reset(struct uart_softc *sc, int size)
+{
+	struct fifo *fifo = &sc->rxfifo;
+	char flushbuf[32];
+	ssize_t nread;
+	int error;
+
+	memset(fifo, 0, sizeof (*fifo));
+	fifo->size = size;
+
+	if (sc->tty.opened) {
+		/*
+		 * Flush unread tty input: keep reading until a read returns
+		 * anything other than a full buffer.
+		 */
+		do {
+			nread = read(sc->tty.fd, flushbuf, sizeof (flushbuf));
+		} while (nread == sizeof (flushbuf));
+
+		/* Have mevent report new characters on the tty fd. */
+		error = mevent_enable(sc->mev);
+		assert(error == 0);
+	}
+#ifndef __FreeBSD__
+	if (sc->sock && sc->usc_sock.clifd != -1) {
+		/* Flush any unread input from the socket buffer. */
+		do {
+			nread = read(sc->usc_sock.clifd, flushbuf,
+			    sizeof (flushbuf));
+		} while (nread == sizeof (flushbuf));
+
+		/* Have mevent report new data on the client socket. */
+		error = mevent_enable(sc->mev);
+		assert(error == 0);
+	}
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Returns non-zero when the receive FIFO has room for at least one more
+ * character.
+ */
+static int
+rxfifo_available(struct uart_softc *sc)
+{
+	return (sc->rxfifo.num < sc->rxfifo.size);
+}
+
+/*
+ * Append `ch' to the receive FIFO.  Returns 0 on success, or -1 if the FIFO
+ * is already full.  When this character fills the FIFO, read events on the
+ * backend are disabled.
+ */
+static int
+rxfifo_putchar(struct uart_softc *sc, uint8_t ch)
+{
+	struct fifo *fifo = &sc->rxfifo;
+	int error;
+
+	if (fifo->num >= fifo->size)
+		return (-1);
+
+	fifo->buf[fifo->windex] = ch;
+	fifo->windex = (fifo->windex + 1) % fifo->size;
+	fifo->num++;
+
+	if (rxfifo_available(sc))
+		return (0);
+
+	/* The FIFO is now full: stop the mevent callback. */
+	if (sc->tty.opened) {
+		error = mevent_disable(sc->mev);
+		assert(error == 0);
+	}
+#ifndef __FreeBSD__
+	if (sc->sock && sc->usc_sock.clifd != -1) {
+		error = mevent_disable(sc->mev);
+		assert(error == 0);
+	}
+#endif /* __FreeBSD__ */
+
+	return (0);
+}
+
+/*
+ * Remove and return the oldest character in the receive FIFO, or return -1
+ * if it is empty.  If the FIFO was full beforehand, read events on the
+ * backend are re-enabled.
+ */
+static int
+rxfifo_getchar(struct uart_softc *sc)
+{
+	struct fifo *fifo = &sc->rxfifo;
+	int c, error, wasfull;
+
+	if (fifo->num <= 0)
+		return (-1);
+
+	/* Sample fullness before the character is taken out. */
+	wasfull = rxfifo_available(sc) ? 0 : 1;
+
+	c = fifo->buf[fifo->rindex];
+	fifo->rindex = (fifo->rindex + 1) % fifo->size;
+	fifo->num--;
+
+	if (wasfull) {
+		if (sc->tty.opened) {
+			error = mevent_enable(sc->mev);
+			assert(error == 0);
+		}
+#ifndef __FreeBSD__
+		if (sc->sock && sc->usc_sock.clifd != -1) {
+			error = mevent_enable(sc->mev);
+			assert(error == 0);
+		}
+#endif /* __FreeBSD__ */
+	}
+
+	return (c);
+}
+
+/*
+ * Returns the number of characters currently held in the receive FIFO.
+ */
+static int
+rxfifo_numchars(struct uart_softc *sc)
+{
+	return (sc->rxfifo.num);
+}
+
+/*
+ * Put the backend tty into raw mode and register uart_drain() to be called
+ * when the tty fd becomes readable.
+ */
+static void
+uart_opentty(struct uart_softc *sc)
+{
+	struct mevent *mev;
+
+	ttyopen(&sc->tty);
+
+	mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc);
+	sc->mev = mev;
+	assert(sc->mev != NULL);
+}
+
+/*
+ * Compute the MSR status bits that correspond to the given MCR value.
+ * The result never has any of the MSR delta bits set.
+ */
+static uint8_t
+modem_status(uint8_t mcr)
+{
+	uint8_t msr = 0;
+
+	if ((mcr & MCR_LOOPBACK) == 0) {
+		/*
+		 * Always assert DCD and DSR so tty open doesn't block
+		 * even if CLOCAL is turned off.
+		 */
+		msr = MSR_DCD | MSR_DSR;
+	} else {
+		/*
+		 * In loopback mode certain MCR bits are reflected back
+		 * into the MSR.
+		 */
+		if (mcr & MCR_RTS)
+			msr |= MSR_CTS;
+		if (mcr & MCR_DTR)
+			msr |= MSR_DSR;
+		if (mcr & MCR_OUT1)
+			msr |= MSR_RI;
+		if (mcr & MCR_OUT2)
+			msr |= MSR_DCD;
+	}
+	assert((msr & MSR_DELTA_MASK) == 0);
+
+	return (msr);
+}
+
+/*
+ * Choose the interrupt reason to report in the IIR.  Conditions are tested
+ * in the order below, and each counts only when its enable bit is set in
+ * the IER:
+ *  - overrun error recorded in the LSR
+ *  - characters waiting in the receive FIFO
+ *  - transmit holding register empty
+ *  - modem status change
+ *
+ * Returns IIR_NOPEND when none of them applies.
+ */
+static int
+uart_intr_reason(struct uart_softc *sc)
+{
+	const uint8_t ier = sc->ier;
+
+	if ((ier & IER_ERLS) != 0 && (sc->lsr & LSR_OE) != 0)
+		return (IIR_RLS);
+
+	if ((ier & IER_ERXRDY) != 0 && rxfifo_numchars(sc) > 0)
+		return (IIR_RXTOUT);
+
+	if ((ier & IER_ETXRDY) != 0 && sc->thre_int_pending)
+		return (IIR_TXRDY);
+
+	if ((ier & IER_EMSC) != 0 && (sc->msr & MSR_DELTA_MASK) != 0)
+		return (IIR_MLSC);
+
+	return (IIR_NOPEND);
+}
+
+/*
+ * Put the UART into its initial state: load the divisor latch for
+ * DEFAULT_BAUD, recompute the modem status from the current MCR, and reset
+ * the receive FIFO to a depth of one.
+ */
+static void
+uart_reset(struct uart_softc *sc)
+{
+	uint16_t divisor;
+
+	divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+	sc->dll = divisor & 0xff;
+	/*
+	 * The MSB latch holds bits 8-15 of the 16-bit divisor.  Shifting a
+	 * 16-bit value by 16 would always store zero here.
+	 */
+	sc->dlh = divisor >> 8;
+	sc->msr = modem_status(sc->mcr);
+
+	rxfifo_reset(sc, 1);	/* no fifo until enabled by software */
+}
+
+/*
+ * Drive the COM port's interrupt pin: assert it when uart_intr_reason()
+ * reports a pending condition, deassert it otherwise.
+ */
+static void
+uart_toggle_intr(struct uart_softc *sc)
+{
+	if (uart_intr_reason(sc) != IIR_NOPEND)
+		(*sc->intr_assert)(sc->arg);
+	else
+		(*sc->intr_deassert)(sc->arg);
+}
+
+/*
+ * Read-event callback for the backend tty.  Outside loopback mode, move
+ * characters from the tty into the receive FIFO until the FIFO is full or
+ * no more can be read, then update the interrupt line.  In loopback mode a
+ * single character is read and discarded.
+ */
+static void
+uart_drain(int fd, enum ev_type ev, void *arg)
+{
+	struct uart_softc *sc = arg;
+	int ch;
+
+	assert(fd == sc->tty.fd);
+	assert(ev == EVF_READ);
+
+	/*
+	 * This runs in the context of the mevent thread, so take the softc
+	 * lock to protect against concurrent access from a vCPU i/o exit.
+	 */
+	pthread_mutex_lock(&sc->mtx);
+
+	if ((sc->mcr & MCR_LOOPBACK) == 0) {
+		while (rxfifo_available(sc)) {
+			ch = ttyread(&sc->tty);
+			if (ch == -1)
+				break;
+			rxfifo_putchar(sc, ch);
+		}
+		uart_toggle_intr(sc);
+	} else {
+		(void) ttyread(&sc->tty);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Handle a guest write of `value' to the UART register at `offset'.  The
+ * softc lock is held for the whole operation, and the interrupt line is
+ * re-evaluated before returning.
+ */
+void
+uart_write(struct uart_softc *sc, int offset, uint8_t value)
+{
+	bool fifo_on;
+	uint8_t msr;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/*
+	 * While DLAB is set, REG_DLL and REG_DLH address the divisor latch
+	 * and take precedence over the normal register decode.
+	 */
+	if ((sc->lcr & LCR_DLAB) != 0) {
+		if (offset == REG_DLL) {
+			sc->dll = value;
+			goto done;
+		} else if (offset == REG_DLH) {
+			sc->dlh = value;
+			goto done;
+		}
+	}
+
+	switch (offset) {
+	case REG_DATA:
+		if (sc->mcr & MCR_LOOPBACK) {
+			/* Loop the byte back; a full FIFO is an overrun. */
+			if (rxfifo_putchar(sc, value) != 0)
+				sc->lsr |= LSR_OE;
+		} else if (sc->tty.opened) {
+			ttywrite(&sc->tty, value);
+#ifndef __FreeBSD__
+		} else if (sc->sock) {
+			sockwrite(sc, value);
+#endif
+		} /* else drop on floor */
+		sc->thre_int_pending = true;
+		break;
+
+	case REG_IER:
+#ifndef __FreeBSD__
+		/*
+		 * Assert an interrupt if re-enabling the THRE intr, since we
+		 * always report THRE as active in the status register.
+		 */
+		if ((value & IER_ETXRDY) != 0 &&
+		    (sc->ier & IER_ETXRDY) == 0) {
+			sc->thre_int_pending = true;
+		}
+#endif
+		/* Only bits 0-3 are kept; bits 4-7 always read as 0. */
+		sc->ier = value & 0x0F;
+		break;
+
+	case REG_FCR:
+		fifo_on = (value & FCR_ENABLE) != 0;
+
+		/*
+		 * Switching between FIFO and 16450 mode, in either
+		 * direction, resets the FIFO contents.
+		 */
+		if (fifo_on != ((sc->fcr & FCR_ENABLE) != 0))
+			rxfifo_reset(sc, fifo_on ? FIFOSZ : 1);
+
+		/*
+		 * The other FCR bits only take effect while FCR_ENABLE
+		 * is '1'.
+		 */
+		if (!fifo_on) {
+			sc->fcr = 0;
+		} else {
+			if ((value & FCR_RCV_RST) != 0)
+				rxfifo_reset(sc, FIFOSZ);
+
+			sc->fcr = value &
+			    (FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
+		}
+		break;
+
+	case REG_LCR:
+		sc->lcr = value;
+		break;
+
+	case REG_MCR:
+		/* Apply mask so that bits 5-7 are 0 */
+		sc->mcr = value & 0x1F;
+		msr = modem_status(sc->mcr);
+
+		/*
+		 * Compare the new status with the previous MSR and latch
+		 * the matching delta bit for every line that changed.
+		 * RI only latches on a 1 -> 0 transition.
+		 */
+		if ((msr & MSR_CTS) != (sc->msr & MSR_CTS))
+			sc->msr |= MSR_DCTS;
+		if ((msr & MSR_DSR) != (sc->msr & MSR_DSR))
+			sc->msr |= MSR_DDSR;
+		if ((msr & MSR_DCD) != (sc->msr & MSR_DCD))
+			sc->msr |= MSR_DDCD;
+		if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
+			sc->msr |= MSR_TERI;
+
+		/* Install the new status while keeping the delta bits. */
+		sc->msr &= MSR_DELTA_MASK;
+		sc->msr |= msr;
+		break;
+
+	case REG_LSR:
+		/*
+		 * Line status register is not meant to be written to
+		 * during normal operation.
+		 */
+		break;
+
+	case REG_MSR:
+		/*
+		 * As far as I can tell MSR is a read-only register.
+		 */
+		break;
+
+	case REG_SCR:
+		sc->scr = value;
+		break;
+
+	default:
+		break;
+	}
+
+done:
+	uart_toggle_intr(sc);
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Handle a guest read of the UART register at `offset' and return its
+ * value; unknown offsets read as 0xFF.  Reads of the data, IIR, LSR and MSR
+ * registers have side effects, noted below.  The softc lock is held for
+ * the whole operation, and the interrupt line is re-evaluated before
+ * returning.
+ */
+uint8_t
+uart_read(struct uart_softc *sc, int offset)
+{
+	uint8_t intr_reason, reg;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	/*
+	 * While DLAB is set, REG_DLL and REG_DLH address the divisor latch
+	 * and take precedence over the normal register decode.
+	 */
+	if ((sc->lcr & LCR_DLAB) != 0) {
+		if (offset == REG_DLL) {
+			reg = sc->dll;
+			goto done;
+		} else if (offset == REG_DLH) {
+			reg = sc->dlh;
+			goto done;
+		}
+	}
+
+	switch (offset) {
+	case REG_DATA:
+		/* Pops the FIFO; an empty FIFO yields -1, i.e. 0xFF. */
+		reg = rxfifo_getchar(sc);
+		break;
+
+	case REG_IER:
+		reg = sc->ier;
+		break;
+
+	case REG_IIR:
+		intr_reason = uart_intr_reason(sc);
+
+		/* Reading the IIR acknowledges a pending THRE interrupt. */
+		if (intr_reason == IIR_TXRDY)
+			sc->thre_int_pending = false;
+
+		reg = intr_reason;
+		if (sc->fcr & FCR_ENABLE)
+			reg |= IIR_FIFO_MASK;
+		break;
+
+	case REG_LCR:
+		reg = sc->lcr;
+		break;
+
+	case REG_MCR:
+		reg = sc->mcr;
+		break;
+
+	case REG_LSR:
+		/* Transmitter is always ready for more data */
+		sc->lsr |= LSR_TEMT | LSR_THRE;
+
+		/* Report whether receive data is waiting */
+		if (rxfifo_numchars(sc) > 0)
+			sc->lsr |= LSR_RXRDY;
+		else
+			sc->lsr &= ~LSR_RXRDY;
+
+		reg = sc->lsr;
+
+		/* The LSR_OE bit is cleared on LSR read */
+		sc->lsr &= ~LSR_OE;
+		break;
+
+	case REG_MSR:
+		/* MSR delta bits are cleared on read */
+		reg = sc->msr;
+		sc->msr &= ~MSR_DELTA_MASK;
+		break;
+
+	case REG_SCR:
+		reg = sc->scr;
+		break;
+
+	default:
+		reg = 0xFF;
+		break;
+	}
+
+done:
+	uart_toggle_intr(sc);
+	pthread_mutex_unlock(&sc->mtx);
+
+	return (reg);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Read-event callback for the connected socket client.  Outside loopback
+ * mode, bytes are moved from the socket into the receive FIFO until the
+ * FIFO is full or the socket has nothing more to offer, and the interrupt
+ * line is updated.  In loopback mode a single byte is read and discarded.
+ *
+ * In both modes, end-of-file or a read error other than EAGAIN/EINTR
+ * closes the client connection, so a disconnected client is released
+ * rather than left registered with a permanently readable descriptor.
+ */
+static void
+uart_sock_drain(int fd, enum ev_type ev, void *arg)
+{
+	struct uart_softc *sc = arg;
+	bool err_close = false;
+	ssize_t res;
+	char ch;
+
+	/*
+	 * Take the softc lock to protect against concurrent
+	 * access from a vCPU i/o exit
+	 */
+	pthread_mutex_lock(&sc->mtx);
+
+	if ((sc->mcr & MCR_LOOPBACK) != 0) {
+		/* Discard the byte, but still notice a dead connection. */
+		res = read(sc->usc_sock.clifd, &ch, 1);
+		if (res == 0 ||
+		    (res == -1 && errno != EAGAIN && errno != EINTR)) {
+			err_close = true;
+		}
+	} else {
+		while (rxfifo_available(sc)) {
+			res = read(sc->usc_sock.clifd, &ch, 1);
+			if (res == 0) {
+				/* EOF: the client went away */
+				err_close = true;
+				break;
+			} else if (res == -1) {
+				if (errno != EAGAIN && errno != EINTR) {
+					err_close = true;
+				}
+				break;
+			}
+
+			rxfifo_putchar(sc, ch);
+		}
+		uart_toggle_intr(sc);
+	}
+
+	if (err_close) {
+		(void) fprintf(stderr, "uart: closing client conn\n");
+		(void) shutdown(sc->usc_sock.clifd, SHUT_RDWR);
+		mevent_delete_close(sc->mev);
+		sc->mev = NULL;
+		sc->usc_sock.clifd = -1;
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Read-event callback for the listening socket: accept a pending connection
+ * and, if no client is attached yet, make it the console client and
+ * register uart_sock_drain() for its input.
+ *
+ * The new connection is dropped when a client is already attached, when it
+ * cannot be made non-blocking, or when its read event cannot be registered.
+ * In the last case sc->mev and clifd are left untouched, so the rest of the
+ * file never sees a client descriptor without a matching event.
+ */
+static void
+uart_sock_accept(int fd, enum ev_type ev, void *arg)
+{
+	struct uart_softc *sc = arg;
+	struct mevent *mev;
+	bool reject = true;
+	int connfd;
+
+	connfd = accept(sc->usc_sock.servfd, NULL, NULL);
+	if (connfd == -1) {
+		return;
+	}
+
+	/*
+	 * Do client connection management under protection of the softc lock
+	 * to avoid racing with concurrent UART events.
+	 */
+	pthread_mutex_lock(&sc->mtx);
+
+	if (sc->usc_sock.clifd != -1) {
+		/* we're already handling a client */
+		(void) fprintf(stderr, "uart: unexpected client conn\n");
+	} else if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) {
+		perror("uart: fcntl(O_NONBLOCK)");
+	} else if ((mev = mevent_add(connfd, EVF_READ, uart_sock_drain,
+	    sc)) == NULL) {
+		(void) fprintf(stderr,
+		    "uart: mevent_add failed for client conn\n");
+	} else {
+		sc->usc_sock.clifd = connfd;
+		sc->mev = mev;
+		reject = false;
+	}
+
+	if (reject) {
+		(void) shutdown(connfd, SHUT_RDWR);
+		(void) close(connfd);
+	}
+
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Create a unix domain stream socket bound to `path' and start listening
+ * on it with a backlog of one.  Anything already present at `path' is
+ * unlinked first.  Returns the listening descriptor, or -1 after printing a
+ * message if the path is too long or any step fails.
+ */
+static int
+init_sock(const char *path)
+{
+	struct sockaddr_un servaddr;
+	size_t pathlen;
+	int servfd;
+
+	memset(&servaddr, 0, sizeof (servaddr));
+	servaddr.sun_family = AF_UNIX;
+
+	pathlen = strlcpy(servaddr.sun_path, path, sizeof (servaddr.sun_path));
+	if (pathlen >= sizeof (servaddr.sun_path)) {
+		(void) fprintf(stderr, "uart: path '%s' too long\n",
+		    path);
+		return (-1);
+	}
+
+	servfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (servfd == -1) {
+		(void) fprintf(stderr, "uart: socket() error - %s\n",
+		    strerror(errno));
+		return (-1);
+	}
+	(void) unlink(servaddr.sun_path);
+
+	if (bind(servfd, (struct sockaddr *)&servaddr,
+	    sizeof (servaddr)) == -1) {
+		(void) fprintf(stderr, "uart: bind() error - %s\n",
+		    strerror(errno));
+	} else if (listen(servfd, 1) == -1) {
+		(void) fprintf(stderr, "uart: listen() error - %s\n",
+		    strerror(errno));
+	} else {
+		return (servfd);
+	}
+
+	/* bind() or listen() failed: remove the path and drop the socket. */
+	(void) unlink(servaddr.sun_path);
+	(void) close(servfd);
+	return (-1);
+}
+#endif /* not __FreeBSD__ */
+
+/*
+ * Claim legacy COM port `which' (an index into uart_lres) and report its
+ * base address and irq through the out parameters.  Returns 0 on success,
+ * or -1 if the index is out of range or the port is already in use.
+ */
+int
+uart_legacy_alloc(int which, int *baseaddr, int *irq)
+{
+	if (which < 0 || which >= UART_NLDEVS)
+		return (-1);
+	if (uart_lres[which].inuse)
+		return (-1);
+
+	*baseaddr = uart_lres[which].baseaddr;
+	*irq = uart_lres[which].irq;
+	uart_lres[which].inuse = true;
+
+	return (0);
+}
+
+/*
+ * Allocate and reset a UART instance.
+ *
+ * intr_assert / intr_deassert are called with `arg' to raise or lower the
+ * interrupt line.  Returns the new softc, or NULL if memory could not be
+ * allocated.
+ */
+struct uart_softc *
+uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert,
+    void *arg)
+{
+	struct uart_softc *sc;
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL)
+		return (NULL);
+
+	sc->arg = arg;
+	sc->intr_assert = intr_assert;
+	sc->intr_deassert = intr_deassert;
+
+	pthread_mutex_init(&sc->mtx, NULL);
+
+	uart_reset(sc);
+
+	return (sc);
+}
+
+/*
+ * Open the device named by `opts' as the tty backend.  Returns 0 and
+ * records the descriptor in sc->tty if it is a terminal.  Returns -1 if the
+ * open fails or the path is not a terminal; in the latter case the
+ * descriptor is closed again rather than leaked.
+ */
+static int
+uart_tty_backend(struct uart_softc *sc, const char *opts)
+{
+	int fd;
+
+	fd = open(opts, O_RDWR | O_NONBLOCK);
+	if (fd < 0)
+		return (-1);
+
+	if (!isatty(fd)) {
+		(void) close(fd);
+		return (-1);
+	}
+
+	sc->tty.fd = fd;
+	sc->tty.opened = true;
+
+	return (0);
+}
+
+#ifndef __FreeBSD__
+/*
+ * Configure a unix domain socket backend from an option string of the form
+ * "socket,<path>", where <path> must start with '/'.  Any other option makes
+ * the call fail.  On success the listening socket is created and
+ * uart_sock_accept() is registered for it.  Returns 0 on success, -1 on
+ * failure.
+ */
+static int
+uart_sock_backend(struct uart_softc *sc, const char *inopts)
+{
+	char *opts;
+	char *opt;
+	char *nextopt;
+	char *path = NULL;
+
+	if (strncmp(inopts, "socket,", 7) != 0) {
+		return (-1);
+	}
+	if ((opts = strdup(inopts + 7)) == NULL) {
+		return (-1);
+	}
+
+	nextopt = opts;
+	while ((opt = strsep(&nextopt, ",")) != NULL) {
+		if (path == NULL && *opt == '/') {
+			path = opt;
+			continue;
+		}
+		/*
+		 * XXX check for server and client options here.  For now,
+		 * everything is a server
+		 */
+		free(opts);
+		return (-1);
+	}
+	if (path == NULL) {
+		free(opts);
+		return (-1);
+	}
+
+	sc->usc_sock.clifd = -1;
+	sc->usc_sock.servfd = init_sock(path);
+
+	/*
+	 * init_sock() copies the path into its own sockaddr, so the
+	 * duplicated option string (which `path' points into) is no longer
+	 * needed on either the success or the failure path.
+	 */
+	free(opts);
+
+	if (sc->usc_sock.servfd == -1) {
+		return (-1);
+	}
+	sc->sock = true;
+	sc->tty.fd = -1;
+	sc->usc_sock.servmev = mevent_add(sc->usc_sock.servfd, EVF_READ,
+	    uart_sock_accept, sc);
+	assert(sc->usc_sock.servmev != NULL);
+
+	return (0);
+}
+#endif /* not __FreeBSD__ */
+
+/*
+ * Select the backend for this UART from `opts':
+ *  - NULL: no backend; returns 0.
+ *  - "stdio": use stdin, allowed for only one UART.
+ *  - "socket,...": unix domain socket (non-FreeBSD builds only); handled
+ *    entirely by uart_sock_backend().
+ *  - anything else: treated as the path of a tty device.
+ *
+ * For stdio and tty backends the descriptor is made non-blocking and
+ * handed to uart_opentty().  Returns 0 on success, non-zero on failure.
+ */
+int
+uart_set_backend(struct uart_softc *sc, const char *opts)
+{
+	int retval = -1;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+	cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ };
+#endif
+
+	if (opts == NULL)
+		return (0);
+
+#ifndef __FreeBSD__
+	if (strncmp("socket,", opts, 7) == 0)
+		return (uart_sock_backend(sc, opts));
+#endif
+
+	if (strcmp("stdio", opts) == 0) {
+		/* stdio can back a single UART only */
+		if (!uart_stdio) {
+			sc->tty.fd = STDIN_FILENO;
+			sc->tty.opened = true;
+			uart_stdio = true;
+			retval = 0;
+		}
+	} else if (uart_tty_backend(sc, opts) == 0) {
+		retval = 0;
+	}
+
+	/* Make the backend file descriptor non-blocking */
+	if (retval == 0 && sc->tty.fd != -1)
+		retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK);
+
+	if (retval != 0)
+		return (retval);
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ,
+	    CAP_WRITE);
+	if (caph_rights_limit(sc->tty.fd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+	if (caph_ioctls_limit(sc->tty.fd, cmds, nitems(cmds)) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+	if (!uart_stdio) {
+		if (caph_limit_stdin() == -1)
+			errx(EX_OSERR,
+			    "Unable to apply rights for sandbox");
+	}
+#endif
+	uart_opentty(sc);
+
+	return (0);
+}
diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h
new file mode 100644
index 0000000000..a87202df1f
--- /dev/null
+++ b/usr/src/cmd/bhyve/uart_emul.h
@@ -0,0 +1,47 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _UART_EMUL_H_
+#define _UART_EMUL_H_
+
+
+#define UART_IO_BAR_SIZE 8
+
+struct uart_softc; /* opaque to callers; defined in uart_emul.c */
+
+typedef void (*uart_intr_func_t)(void *arg); /* raises or lowers the interrupt; gets the arg passed to uart_init() */
+struct uart_softc *uart_init(uart_intr_func_t intr_assert,
+ uart_intr_func_t intr_deassert, void *arg);
+
+int uart_legacy_alloc(int unit, int *ioaddr, int *irq); /* 0 on success, -1 if unit is invalid or already in use */
+uint8_t uart_read(struct uart_softc *sc, int offset); /* guest read of the register at offset */
+void uart_write(struct uart_softc *sc, int offset, uint8_t value); /* guest write of the register at offset */
+int uart_set_backend(struct uart_softc *sc, const char *opt); /* 0 on success or when opt is NULL */
+#endif
diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c
new file mode 100644
index 0000000000..6ecdd9530e
--- /dev/null
+++ b/usr/src/cmd/bhyve/usb_emul.c
@@ -0,0 +1,78 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "usb_emul.h"
+
+SET_DECLARE(usb_emu_set, struct usb_devemu);
+
+/*
+ * Look up a USB device emulation by name in the usb_emu_set linker set.
+ * Returns the first entry whose ue_emu equals `name', or NULL if there is
+ * none.
+ */
+struct usb_devemu *
+usb_emu_finddev(char *name)
+{
+	struct usb_devemu **udpp;
+
+	SET_FOREACH(udpp, usb_emu_set) {
+		if (strcmp((*udpp)->ue_emu, name) == 0)
+			return (*udpp);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Claim the block at the tail of the transfer's block ring and fill it in
+ * with the given buffer, length, HCI data and ccs value; its `processed'
+ * and `bdone' fields are zeroed.  Returns the block, or NULL if the
+ * transfer already holds USB_MAX_XFER_BLOCKS blocks.
+ */
+struct usb_data_xfer_block *
+usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen,
+    void *hci_data, int ccs)
+{
+	struct usb_data_xfer_block *xb;
+
+	if (xfer->ndata >= USB_MAX_XFER_BLOCKS)
+		return (NULL);
+
+	/* Take the tail slot, then advance the tail around the ring. */
+	xb = &xfer->data[xfer->tail];
+	xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS;
+	xfer->ndata++;
+
+	xb->buf = buf;
+	xb->blen = blen;
+	xb->hci_data = hci_data;
+	xb->ccs = ccs;
+	xb->processed = 0;
+	xb->bdone = 0;
+
+	return (xb);
+}
diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h
new file mode 100644
index 0000000000..e55a421b6f
--- /dev/null
+++ b/usr/src/cmd/bhyve/usb_emul.h
@@ -0,0 +1,164 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com>
+ * Copyright 2018 Joyent, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _USB_EMUL_H_
+#define _USB_EMUL_H_
+
+#include <stdlib.h>
+#include <sys/linker_set.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <synch.h>
+#endif
+
+#define USB_MAX_XFER_BLOCKS 8
+
+#define USB_XFER_OUT 0
+#define USB_XFER_IN 1
+
+
+
+struct usb_hci;
+struct usb_device_request;
+struct usb_data_xfer;
+
+/* Device emulation handlers */
+struct usb_devemu {
+ char *ue_emu; /* name of device emulation */
+ int ue_usbver; /* usb version: 2 or 3 */
+ int ue_usbspeed; /* usb device speed */
+
+ /* instance creation */
+ void *(*ue_init)(struct usb_hci *hci, char *opt);
+
+ /* handlers */
+ int (*ue_request)(void *sc, struct usb_data_xfer *xfer);
+ int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir,
+ int epctx);
+ int (*ue_reset)(void *sc);
+ int (*ue_remove)(void *sc);
+ int (*ue_stop)(void *sc);
+};
+#define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x);
+
+/*
+ * USB device events to notify HCI when state changes
+ */
+enum hci_usbev {
+ USBDEV_ATTACH,
+ USBDEV_RESET,
+ USBDEV_STOP,
+ USBDEV_REMOVE,
+};
+
+/* usb controller, ie xhci, ehci */
+struct usb_hci {
+ int (*hci_intr)(struct usb_hci *hci, int epctx);
+ int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid,
+ void *param);
+ void *hci_sc; /* private softc for hci */
+
+ /* controller managed fields */
+ int hci_address;
+ int hci_port;
+};
+
+/*
+ * Each xfer block is mapped to the hci transfer block.
+ * On input into the device handler, blen is set to the length of buf.
+ * The device handler is to update blen to reflect on the residual size
+ * of the buffer, i.e. len(buf) - len(consumed).
+ */
+struct usb_data_xfer_block {
+ void *buf; /* IN or OUT pointer */
+ int blen; /* in:len(buf), out:len(remaining) */
+ int bdone; /* bytes transferred */
+ uint32_t processed; /* device processed this + errcode */
+ void *hci_data; /* HCI private reference */
+ int ccs;
+ uint32_t streamid;
+ uint64_t trbnext; /* next TRB guest address */
+};
+
+struct usb_data_xfer {
+ struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS];
+ struct usb_device_request *ureq; /* setup ctl request */
+ int ndata; /* # of data items */
+ int head;
+ int tail;
+ pthread_mutex_t mtx;
+};
+
+enum USB_ERRCODE {
+ USB_ACK,
+ USB_NAK,
+ USB_STALL,
+ USB_NYET,
+ USB_ERR,
+ USB_SHORT
+};
+
+/*
+ * The error code of a block lives in bits 8 and up of `processed`; the low
+ * byte is kept for the "device processed this block" flag.
+ *
+ * Both expansions are fully parenthesized so that they compose safely with
+ * surrounding operators (e.g. `USB_DATA_GET_ERRCODE(x) == USB_NAK`) and
+ * with expression arguments.
+ */
+#define	USB_DATA_GET_ERRCODE(x)		(((x)->processed) >> 8)
+#define	USB_DATA_SET_ERRCODE(x,e)	do {				\
+		(x)->processed = ((x)->processed & 0xFF) | ((e) << 8);	\
+	} while (0)
+
+#define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL)
+
+#define USB_DATA_XFER_INIT(x) do { \
+ memset((x), 0, sizeof(*(x))); \
+ pthread_mutex_init(&((x)->mtx), NULL); \
+ } while (0)
+
+#define USB_DATA_XFER_RESET(x) do { \
+ memset((x)->data, 0, sizeof((x)->data)); \
+ (x)->ndata = 0; \
+ (x)->head = (x)->tail = 0; \
+ } while (0)
+
+#define USB_DATA_XFER_LOCK(x) do { \
+ pthread_mutex_lock(&((x)->mtx)); \
+ } while (0)
+
+#define USB_DATA_XFER_UNLOCK(x) do { \
+ pthread_mutex_unlock(&((x)->mtx)); \
+ } while (0)
+#ifndef __FreeBSD__
+#define USB_DATA_XFER_LOCK_HELD(x) MUTEX_HELD(&((x)->mtx))
+#endif
+
+struct usb_devemu *usb_emu_finddev(char *name);
+
+struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer,
+ void *buf, int blen, void *hci_data, int ccs);
+
+
+#endif /* _USB_EMUL_H_ */
diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c
new file mode 100644
index 0000000000..e613012071
--- /dev/null
+++ b/usr/src/cmd/bhyve/usb_mouse.c
@@ -0,0 +1,802 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dev/usb/usb.h>
+#include <dev/usb/usbdi.h>
+
+#include "usb_emul.h"
+#include "console.h"
+#include "bhyvegc.h"
+
+/* Set non-zero to enable the DPRINTF() debug traces in this file. */
+static int umouse_debug = 0;
+/*
+ * DPRINTF((fmt, ...)) prints only when umouse_debug is set.  It is wrapped
+ * in do/while(0) so that it behaves as a single statement and cannot
+ * capture a following `else`.  WPRINTF((fmt, ...)) always prints.
+ */
+#define	DPRINTF(params) do { if (umouse_debug) printf params; } while (0)
+#define	WPRINTF(params) printf params
+
+/* USB endpoint context (1-15) for reporting mouse data events*/
+#define UMOUSE_INTR_ENDPT 1
+
+#define UMOUSE_REPORT_DESC_TYPE 0x22
+
+#define UMOUSE_GET_REPORT 0x01
+#define UMOUSE_GET_IDLE 0x02
+#define UMOUSE_GET_PROTOCOL 0x03
+#define UMOUSE_SET_REPORT 0x09
+#define UMOUSE_SET_IDLE 0x0A
+#define UMOUSE_SET_PROTOCOL 0x0B
+
+#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) }
+
+enum {
+ UMSTR_LANG,
+ UMSTR_MANUFACTURER,
+ UMSTR_PRODUCT,
+ UMSTR_SERIAL,
+ UMSTR_CONFIG,
+ UMSTR_MAX
+};
+
+static const char *umouse_desc_strings[] = {
+ "\x04\x09",
+ "BHYVE",
+ "HID Tablet",
+ "01",
+ "HID Tablet Device",
+};
+
+struct umouse_hid_descriptor {
+ uint8_t bLength;
+ uint8_t bDescriptorType;
+ uint8_t bcdHID[2];
+ uint8_t bCountryCode;
+ uint8_t bNumDescriptors;
+ uint8_t bReportDescriptorType;
+ uint8_t wItemLength[2];
+} __packed;
+
+struct umouse_config_desc {
+ struct usb_config_descriptor confd;
+ struct usb_interface_descriptor ifcd;
+ struct umouse_hid_descriptor hidd;
+ struct usb_endpoint_descriptor endpd;
+ struct usb_endpoint_ss_comp_descriptor sscompd;
+} __packed;
+
+#define MOUSE_MAX_X 0x8000
+#define MOUSE_MAX_Y 0x8000
+
+static const uint8_t umouse_report_desc[] = {
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x02, /* USAGE (Mouse) */
+ 0xa1, 0x01, /* COLLECTION (Application) */
+ 0x09, 0x01, /* USAGE (Pointer) */
+ 0xa1, 0x00, /* COLLECTION (Physical) */
+ 0x05, 0x09, /* USAGE_PAGE (Button) */
+ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */
+ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
+ 0x75, 0x01, /* REPORT_SIZE (1) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */
+ 0x75, 0x05, /* REPORT_SIZE (5) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x30, /* USAGE (X) */
+ 0x09, 0x31, /* USAGE (Y) */
+ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */
+ 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */
+ 0x75, 0x10, /* REPORT_SIZE (16) */
+ 0x95, 0x02, /* REPORT_COUNT (2) */
+ 0x81, 0x02, /* INPUT (Data,Var,Abs) */
+ 0x05, 0x01, /* USAGE Page (Generic Desktop) */
+ 0x09, 0x38, /* USAGE (Wheel) */
+ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */
+ 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */
+ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */
+ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */
+ 0x75, 0x08, /* REPORT_SIZE (8) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x81, 0x06, /* INPUT (Data,Var,Rel) */
+ 0xc0, /* END_COLLECTION */
+ 0xc0 /* END_COLLECTION */
+};
+
+struct umouse_report {
+ uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */
+ int16_t x; /* x position */
+ int16_t y; /* y position */
+ int8_t z; /* z wheel position */
+} __packed;
+
+
+#define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) }
+
+static struct usb_device_descriptor umouse_dev_desc = {
+ .bLength = sizeof(umouse_dev_desc),
+ .bDescriptorType = UDESC_DEVICE,
+ MSETW(.bcdUSB, UD_USB_3_0),
+ .bMaxPacketSize = 8, /* max packet size */
+ MSETW(.idVendor, 0xFB5D), /* vendor */
+ MSETW(.idProduct, 0x0001), /* product */
+ MSETW(.bcdDevice, 0), /* device version */
+ .iManufacturer = UMSTR_MANUFACTURER,
+ .iProduct = UMSTR_PRODUCT,
+ .iSerialNumber = UMSTR_SERIAL,
+ .bNumConfigurations = 1,
+};
+
+static struct umouse_config_desc umouse_confd = {
+ .confd = {
+ .bLength = sizeof(umouse_confd.confd),
+ .bDescriptorType = UDESC_CONFIG,
+ .wTotalLength[0] = sizeof(umouse_confd),
+ .bNumInterface = 1,
+ .bConfigurationValue = 1,
+ .iConfiguration = UMSTR_CONFIG,
+ .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP,
+ .bMaxPower = 0,
+ },
+ .ifcd = {
+ .bLength = sizeof(umouse_confd.ifcd),
+ .bDescriptorType = UDESC_INTERFACE,
+ .bNumEndpoints = 1,
+ .bInterfaceClass = UICLASS_HID,
+ .bInterfaceSubClass = UISUBCLASS_BOOT,
+ .bInterfaceProtocol = UIPROTO_MOUSE,
+ },
+ .hidd = {
+ .bLength = sizeof(umouse_confd.hidd),
+ .bDescriptorType = 0x21,
+ .bcdHID = { 0x01, 0x10 },
+ .bCountryCode = 0,
+ .bNumDescriptors = 1,
+ .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE,
+ .wItemLength = { sizeof(umouse_report_desc), 0 },
+ },
+ .endpd = {
+ .bLength = sizeof(umouse_confd.endpd),
+ .bDescriptorType = UDESC_ENDPOINT,
+ .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT,
+ .bmAttributes = UE_INTERRUPT,
+ .wMaxPacketSize[0] = 8,
+ .bInterval = 0xA,
+ },
+ .sscompd = {
+ .bLength = sizeof(umouse_confd.sscompd),
+ .bDescriptorType = UDESC_ENDPOINT_SS_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ MSETW(.wBytesPerInterval, 0),
+ },
+};
+
+
+struct umouse_bos_desc {
+ struct usb_bos_descriptor bosd;
+ struct usb_devcap_ss_descriptor usbssd;
+} __packed;
+
+
+struct umouse_bos_desc umouse_bosd = {
+ .bosd = {
+ .bLength = sizeof(umouse_bosd.bosd),
+ .bDescriptorType = UDESC_BOS,
+ HSETW(.wTotalLength, sizeof(umouse_bosd)),
+ .bNumDeviceCaps = 1,
+ },
+ .usbssd = {
+ .bLength = sizeof(umouse_bosd.usbssd),
+ .bDescriptorType = UDESC_DEVICE_CAPABILITY,
+ .bDevCapabilityType = 3,
+ .bmAttributes = 0,
+ HSETW(.wSpeedsSupported, 0x08),
+ .bFunctionalitySupport = 3,
+ .bU1DevExitLat = 0xa, /* dummy - not used */
+ .wU2DevExitLat = { 0x20, 0x00 },
+ }
+};
+
+
+/*
+ * Per-instance state of the emulated HID tablet.
+ */
+struct umouse_softc {
+ /* controller this device is attached to; used to raise interrupts */
+ struct usb_hci *hci;
+
+ /* copy of the option string passed to umouse_init() */
+ char *opt;
+
+ /* latest pointer state, filled in by umouse_event() */
+ struct umouse_report um_report;
+ /* set by umouse_event(), cleared once the report has been delivered */
+ int newdata;
+ /* values stored/returned by the HID class and feature requests */
+ struct {
+ uint8_t idle;
+ uint8_t protocol;
+ uint8_t feature;
+ } hid;
+
+ /* held while um_report, newdata and polling are accessed */
+ pthread_mutex_t mtx;
+ /* held around the hci_intr() call in umouse_event() */
+ pthread_mutex_t ev_mtx;
+ /* non-zero while umouse_data_handler() is copying out a report */
+ int polling;
+ /* NOTE(review): not referenced anywhere in this file */
+ struct timeval prev_evt;
+};
+
+/*
+ * Console pointer callback (registered from umouse_init()).
+ *
+ * Converts the console's button mask and pixel coordinates into a HID
+ * report stored in the softc, flags it as new data, and then asks the
+ * host controller to raise an interrupt for the IN endpoint.  Events are
+ * dropped while console_get_image() returns NULL.
+ */
+static void
+umouse_event(uint8_t button, int x, int y, void *arg)
+{
+	struct umouse_softc *sc = arg;
+	struct bhyvegc_image *img;
+	uint8_t hid_buttons = 0;
+	int8_t wheel = 0;
+
+	img = console_get_image();
+	if (img == NULL)
+		return;		/* not ready */
+
+	/* Remap console button bits onto the report's button bits. */
+	if (button & 0x01)
+		hid_buttons |= 0x01;	/* left */
+	if (button & 0x02)
+		hid_buttons |= 0x04;	/* middle */
+	if (button & 0x04)
+		hid_buttons |= 0x02;	/* right */
+
+	/* Bits 3 and 4 become wheel steps; bit 4 wins if both are set. */
+	if (button & 0x8)
+		wheel = 1;
+	if (button & 0x10)
+		wheel = -1;
+
+	pthread_mutex_lock(&sc->mtx);
+	sc->um_report.buttons = hid_buttons;
+	sc->um_report.z = wheel;
+	/* scale coords to mouse resolution */
+	sc->um_report.x = MOUSE_MAX_X * x / img->width;
+	sc->um_report.y = MOUSE_MAX_Y * y / img->height;
+	sc->newdata = 1;
+	pthread_mutex_unlock(&sc->mtx);
+
+	pthread_mutex_lock(&sc->ev_mtx);
+	sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT);
+	pthread_mutex_unlock(&sc->ev_mtx);
+}
+
+/*
+ * Create one tablet instance attached to controller `hci`.
+ *
+ * `opt` is the device option string and may be NULL, in which case no
+ * copy is stored.  The new instance starts in REPORT protocol and is
+ * registered as a console pointer handler.
+ *
+ * Returns the new softc, or NULL if memory could not be allocated.
+ * NOTE(review): callers are expected to treat NULL as "init failed";
+ * confirm against the HCI code that invokes ue_init.
+ */
+static void *
+umouse_init(struct usb_hci *hci, char *opt)
+{
+	struct umouse_softc *sc;
+
+	sc = calloc(1, sizeof(*sc));
+	if (sc == NULL)
+		return (NULL);
+
+	sc->hci = hci;
+	sc->hid.protocol = 1;	/* REPORT protocol */
+
+	/* strdup(NULL) is undefined; only copy an option that exists. */
+	if (opt != NULL) {
+		sc->opt = strdup(opt);
+		if (sc->opt == NULL) {
+			free(sc);
+			return (NULL);
+		}
+	}
+
+	pthread_mutex_init(&sc->mtx, NULL);
+	pthread_mutex_init(&sc->ev_mtx, NULL);
+
+	console_ptr_register(umouse_event, sc, 10);
+
+	return (sc);
+}
+
+#define UREQ(x,y) ((x) | ((y) << 8))
+
+/*
+ * Handle a control request (xfer->ureq) addressed to the tablet.
+ *
+ * Every queued block of `xfer` is marked processed with bdone reset; the
+ * first block that carries a buffer is used for the data stage.  For that
+ * block, blen is updated to the residual (bytes requested but not
+ * supplied) and bdone to the number of bytes produced.
+ *
+ * Returns USB_ERR_NORMAL_COMPLETION on success, USB_ERR_SHORT_XFER when
+ * fewer bytes than requested were produced, and USB_ERR_IOERROR for
+ * requests that are rejected or not recognised.
+ */
+static int
+umouse_request(void *scarg, struct usb_data_xfer *xfer)
+{
+	struct umouse_softc *sc;
+	struct usb_data_xfer_block *data;
+	const char *str;
+	uint16_t value;
+	uint16_t index;
+	uint16_t len;
+	uint16_t slen;
+	uint8_t *udata;
+	int err;
+	int i, idx;
+	int eshort;
+
+	sc = scarg;
+
+	data = NULL;
+	udata = NULL;
+	idx = xfer->head;
+	for (i = 0; i < xfer->ndata; i++) {
+		xfer->data[idx].bdone = 0;
+		/*
+		 * Test the ring slot being visited (idx), not the loop
+		 * counter: the two differ whenever head is not zero.
+		 */
+		if (data == NULL && USB_DATA_OK(xfer, idx)) {
+			data = &xfer->data[idx];
+			udata = data->buf;
+		}
+
+		xfer->data[idx].processed = 1;
+		idx = (idx + 1) % USB_MAX_XFER_BLOCKS;
+	}
+
+	err = USB_ERR_NORMAL_COMPLETION;
+	eshort = 0;
+
+	if (!xfer->ureq) {
+		DPRINTF(("umouse_request: port %d\r\n", sc->hci->hci_port));
+		goto done;
+	}
+
+	value = UGETW(xfer->ureq->wValue);
+	index = UGETW(xfer->ureq->wIndex);
+	len = UGETW(xfer->ureq->wLength);
+
+	DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, "
+	         "idx 0x%x, len %u\r\n",
+	         sc->hci->hci_port, xfer->ureq->bmRequestType,
+	         xfer->ureq->bRequest, value, index, len));
+
+	switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) {
+	case UREQ(UR_GET_CONFIG, UT_READ_DEVICE):
+		DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)\r\n"));
+		if (!data)
+			break;
+
+		*udata = umouse_confd.confd.bConfigurationValue;
+		data->blen = len > 0 ? len - 1 : 0;
+		eshort = data->blen > 0;
+		data->bdone += 1;
+		break;
+
+	case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
+		DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x\r\n",
+		        value >> 8));
+		if (!data)
+			break;
+
+		switch (value >> 8) {
+		case UDESC_DEVICE:
+			DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= "
+			         "sizeof(umouse_dev_desc) %lu\r\n",
+			         len, sizeof(umouse_dev_desc)));
+			if ((value & 0xFF) != 0) {
+				err = USB_ERR_IOERROR;
+				goto done;
+			}
+			if (len > sizeof(umouse_dev_desc)) {
+				data->blen = len - sizeof(umouse_dev_desc);
+				len = sizeof(umouse_dev_desc);
+			} else
+				data->blen = 0;
+			memcpy(data->buf, &umouse_dev_desc, len);
+			data->bdone += len;
+			break;
+
+		case UDESC_CONFIG:
+			DPRINTF(("umouse: (->UDESC_CONFIG)\r\n"));
+			if ((value & 0xFF) != 0) {
+				err = USB_ERR_IOERROR;
+				goto done;
+			}
+			if (len > sizeof(umouse_confd)) {
+				data->blen = len - sizeof(umouse_confd);
+				len = sizeof(umouse_confd);
+			} else
+				data->blen = 0;
+
+			memcpy(data->buf, &umouse_confd, len);
+			data->bdone += len;
+			break;
+
+		case UDESC_STRING:
+			DPRINTF(("umouse: (->UDESC_STRING)\r\n"));
+			if ((value & 0xFF) >= UMSTR_MAX)
+				goto done;
+			str = umouse_desc_strings[value & 0xFF];
+
+			/*
+			 * Descriptor size: a 2-byte header followed by
+			 * either the two raw LANGID bytes or the string
+			 * widened to 16-bit characters.
+			 */
+			if ((value & 0xFF) == UMSTR_LANG)
+				slen = 4;
+			else
+				slen = 2 + strlen(str) * 2;
+
+			if (len > slen) {
+				data->blen = len - slen;
+				len = slen;
+			} else
+				data->blen = 0;
+
+			/*
+			 * Emit exactly `len` bytes.  Producing the
+			 * descriptor one byte at a time keeps a request
+			 * shorter than the header, or of odd length, from
+			 * writing beyond what was asked for.
+			 */
+			for (i = 0; i < len; i++) {
+				if (i == 0)
+					udata[i] = slen;
+				else if (i == 1)
+					udata[i] = UDESC_STRING;
+				else if ((value & 0xFF) == UMSTR_LANG)
+					udata[i] = str[i - 2];
+				else if ((i & 1) == 0)
+					udata[i] = str[(i - 2) / 2];
+				else
+					udata[i] = '\0';
+			}
+			data->bdone += len;
+
+			/*
+			 * The LANGID descriptor has never been reported as
+			 * a short transfer; keep bypassing that check.
+			 */
+			if ((value & 0xFF) == UMSTR_LANG)
+				goto done;
+
+			break;
+
+		case UDESC_BOS:
+			DPRINTF(("umouse: USB3 BOS\r\n"));
+			if (len > sizeof(umouse_bosd)) {
+				data->blen = len - sizeof(umouse_bosd);
+				len = sizeof(umouse_bosd);
+			} else
+				data->blen = 0;
+			memcpy(udata, &umouse_bosd, len);
+			data->bdone += len;
+			break;
+
+		default:
+			DPRINTF(("umouse: unknown(%d)->ERROR\r\n", value >> 8));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE):
+		DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) "
+		         "0x%x\r\n", (value >> 8)));
+		if (!data)
+			break;
+
+		switch (value >> 8) {
+		case UMOUSE_REPORT_DESC_TYPE:
+			if (len > sizeof(umouse_report_desc)) {
+				data->blen = len - sizeof(umouse_report_desc);
+				len = sizeof(umouse_report_desc);
+			} else
+				data->blen = 0;
+			memcpy(data->buf, umouse_report_desc, len);
+			data->bdone += len;
+			break;
+		default:
+			DPRINTF(("umouse: IO ERROR\r\n"));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE):
+		DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)\r\n"));
+		if (index != 0) {
+			DPRINTF(("umouse get_interface, invalid index %d\r\n",
+			        index));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+
+		if (!data)
+			break;
+
+		if (len > 0) {
+			*udata = 0;
+			data->blen = len - 1;
+		}
+		eshort = data->blen > 0;
+		data->bdone += 1;
+		break;
+
+	case UREQ(UR_GET_STATUS, UT_READ_DEVICE):
+		DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)\r\n"));
+		if (data != NULL && len > 1) {
+			if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP)
+				USETW(udata, UDS_REMOTE_WAKEUP);
+			else
+				USETW(udata, 0);
+			data->blen = len - 2;
+			data->bdone += 2;
+		}
+
+		/* No data block means there is nothing to inspect. */
+		if (data != NULL)
+			eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_STATUS, UT_READ_INTERFACE):
+	case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT):
+		DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)\r\n"));
+		if (data != NULL && len > 1) {
+			USETW(udata, 0);
+			data->blen = len - 2;
+			data->bdone += 2;
+		}
+		if (data != NULL)
+			eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE):
+		/* XXX Controller should've handled this */
+		DPRINTF(("umouse set address %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set config %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set descriptor %u\r\n", value));
+		break;
+
+
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
+		DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_DEVICE) %x\r\n", value));
+		if (value == UF_DEVICE_REMOTE_WAKEUP)
+			sc->hid.feature = 0;
+		break;
+
+	case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE):
+		DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value));
+		if (value == UF_DEVICE_REMOTE_WAKEUP)
+			sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP;
+		break;
+
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
+	case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE):
+	case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
+		DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)\r\n"));
+		err = USB_ERR_IOERROR;
+		goto done;
+
+	case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
+		DPRINTF(("umouse set interface %u\r\n", value));
+		break;
+
+	case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set isoch delay %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_SEL, 0):
+		DPRINTF(("umouse set sel\r\n"));
+		break;
+
+	case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
+		DPRINTF(("umouse synch frame\r\n"));
+		break;
+
+	/* HID device requests */
+
+	case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE):
+		DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) "
+		         "0x%x\r\n", (value >> 8)));
+		if (!data)
+			break;
+
+		if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) {
+			/* TODO read from backend */
+
+			if (len > sizeof(sc->um_report)) {
+				data->blen = len - sizeof(sc->um_report);
+				len = sizeof(sc->um_report);
+			} else
+				data->blen = 0;
+
+			memcpy(data->buf, &sc->um_report, len);
+			data->bdone += len;
+		} else {
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE):
+		if (data != NULL && len > 0) {
+			*udata = sc->hid.idle;
+			data->blen = len - 1;
+			data->bdone += 1;
+		}
+		if (data != NULL)
+			eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE):
+		if (data != NULL && len > 0) {
+			*udata = sc->hid.protocol;
+			data->blen = len - 1;
+			data->bdone += 1;
+		}
+		if (data != NULL)
+			eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE):
+		DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored\r\n"));
+		break;
+
+	case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE):
+		sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8;
+		DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x\r\n",
+		        sc->hid.idle));
+		break;
+
+	case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE):
+		/*
+		 * NOTE(review): this stores the high byte of wValue; the
+		 * HID class specification appears to carry the protocol
+		 * selector in wValue itself -- confirm.
+		 */
+		sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8;
+		DPRINTF(("umouse: (UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE) %x\r\n",
+		        sc->hid.protocol));
+		break;
+
+	default:
+		DPRINTF(("**** umouse request unhandled\r\n"));
+		err = USB_ERR_IOERROR;
+		break;
+	}
+
+done:
+	/*
+	 * NOTE(review): UT_WRITE is presumably defined as 0x00 in
+	 * <dev/usb/usb.h>, which would make the first test always false --
+	 * confirm before relying on blen being cleared for write requests.
+	 */
+	if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) &&
+	    (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL))
+		data->blen = 0;
+	else if (eshort)
+		err = USB_ERR_SHORT_XFER;
+
+	DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u\r\n",
+		err, (data ? data->blen : 0), (data ? data->bdone : 0)));
+
+	return (err);
+}
+
+/*
+ * Handle a data transfer on the tablet's endpoint.
+ *
+ * The first queued block that has both a buffer and a non-zero length is
+ * selected; blocks skipped on the way are marked processed.  For an IN
+ * transfer (`dir` non-zero) the current report is copied into that block
+ * if new data is pending.  OUT transfers are stalled.
+ *
+ * Returns:
+ *   USB_ERR_NORMAL_COMPLETION  report delivered, or no usable block
+ *   USB_ERR_SHORT_XFER         report delivered, buffer was larger
+ *   USB_ERR_CANCELLED          no new data (block flagged USB_NAK)
+ *   USB_ERR_STALLED            OUT transfer, or a copy already running
+ *   USB_ERR_NOMEM              selected block has no buffer
+ */
+static int
+umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir,
+     int epctx)
+{
+	struct umouse_softc *sc;
+	struct usb_data_xfer_block *data;
+	uint8_t *udata;
+	int len, i, idx;
+	int err;
+
+	DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d\r\n",
+	        dir ? "IN" : "OUT", epctx, xfer->data[0].blen));
+
+
+	/* find buffer to add data */
+	udata = NULL;
+	err = USB_ERR_NORMAL_COMPLETION;
+
+	/* handle xfer at first unprocessed item with buffer */
+	data = NULL;
+	idx = xfer->head;
+	for (i = 0; i < xfer->ndata; i++) {
+		data = &xfer->data[idx];
+		if (data->buf != NULL && data->blen != 0) {
+			break;
+		} else {
+			data->processed = 1;
+			data = NULL;
+		}
+		idx = (idx + 1) % USB_MAX_XFER_BLOCKS;
+	}
+	if (!data)
+		goto done;
+
+	udata = data->buf;
+	len = data->blen;
+
+	if (udata == NULL) {
+		DPRINTF(("umouse no buffer provided for input\r\n"));
+		err = USB_ERR_NOMEM;
+		goto done;
+	}
+
+	sc = scarg;
+
+	if (dir) {
+
+		pthread_mutex_lock(&sc->mtx);
+
+		if (!sc->newdata) {
+			err = USB_ERR_CANCELLED;
+			USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK);
+			pthread_mutex_unlock(&sc->mtx);
+			goto done;
+		}
+
+		if (sc->polling) {
+			err = USB_ERR_STALLED;
+			USB_DATA_SET_ERRCODE(data, USB_STALL);
+			pthread_mutex_unlock(&sc->mtx);
+			goto done;
+		}
+		sc->polling = 1;
+
+		if (len > 0) {
+			int rlen;
+
+			/*
+			 * Never copy more than the block can hold: a
+			 * buffer shorter than the report gets a truncated
+			 * report instead of being overrun.
+			 */
+			rlen = sizeof(sc->um_report);
+			if (rlen > len)
+				rlen = len;
+
+			sc->newdata = 0;
+
+			data->processed = 1;
+			data->bdone += rlen;
+			memcpy(udata, &sc->um_report, rlen);
+			data->blen = len - rlen;
+			if (data->blen > 0)
+				err = USB_ERR_SHORT_XFER;
+		}
+
+		sc->polling = 0;
+		pthread_mutex_unlock(&sc->mtx);
+	} else {
+		USB_DATA_SET_ERRCODE(data, USB_STALL);
+		err = USB_ERR_STALLED;
+	}
+
+done:
+	return (err);
+}
+
+/*
+ * Device reset hook: discard any report that has not been delivered yet.
+ * Always returns 0.
+ */
+static int
+umouse_reset(void *scarg)
+{
+	struct umouse_softc *sc = scarg;
+
+	sc->newdata = 0;
+	return (0);
+}
+
+/*
+ * Device removal hook.  Nothing is torn down here; `scarg` is unused and
+ * the function always returns 0.
+ */
+static int
+umouse_remove(void *scarg)
+{
+
+ return (0);
+}
+
+/*
+ * Device stop hook.  No action is taken; `scarg` is unused and the
+ * function always returns 0.
+ */
+static int
+umouse_stop(void *scarg)
+{
+
+ return (0);
+}
+
+
+/*
+ * Device emulation entry for the HID tablet.  It is added to the
+ * usb_emu_set linker set below, so usb_emu_finddev("tablet") returns it.
+ */
+struct usb_devemu ue_mouse = {
+ .ue_emu = "tablet",
+ .ue_usbver = 3,
+ .ue_usbspeed = USB_SPEED_HIGH,
+ .ue_init = umouse_init,
+ .ue_request = umouse_request,
+ .ue_data = umouse_data_handler,
+ .ue_reset = umouse_reset,
+ .ue_remove = umouse_remove,
+ .ue_stop = umouse_stop
+};
+USB_EMUL_SET(ue_mouse);
diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c
new file mode 100644
index 0000000000..314ddeb1e8
--- /dev/null
+++ b/usr/src/cmd/bhyve/vga.c
@@ -0,0 +1,1357 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/vmm.h>
+
+#include "bhyvegc.h"
+#include "console.h"
+#include "inout.h"
+#include "mem.h"
+#include "vga.h"
+
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+
+struct vga_softc {
+ struct mem_range mr;
+
+ struct bhyvegc *gc;
+ int gc_width;
+ int gc_height;
+ struct bhyvegc_image *gc_image;
+
+ uint8_t *vga_ram;
+
+ /*
+ * General registers
+ */
+ uint8_t vga_misc;
+ uint8_t vga_sts1;
+
+ /*
+ * Sequencer
+ */
+ struct {
+ int seq_index;
+ uint8_t seq_reset;
+ uint8_t seq_clock_mode;
+ int seq_cm_dots;
+ uint8_t seq_map_mask;
+ uint8_t seq_cmap_sel;
+ int seq_cmap_pri_off;
+ int seq_cmap_sec_off;
+ uint8_t seq_mm;
+ } vga_seq;
+
+ /*
+ * CRT Controller
+ */
+ struct {
+ int crtc_index;
+ uint8_t crtc_mode_ctrl;
+ uint8_t crtc_horiz_total;
+ uint8_t crtc_horiz_disp_end;
+ uint8_t crtc_start_horiz_blank;
+ uint8_t crtc_end_horiz_blank;
+ uint8_t crtc_start_horiz_retrace;
+ uint8_t crtc_end_horiz_retrace;
+ uint8_t crtc_vert_total;
+ uint8_t crtc_overflow;
+ uint8_t crtc_present_row_scan;
+ uint8_t crtc_max_scan_line;
+ uint8_t crtc_cursor_start;
+ uint8_t crtc_cursor_on;
+ uint8_t crtc_cursor_end;
+ uint8_t crtc_start_addr_high;
+ uint8_t crtc_start_addr_low;
+ uint16_t crtc_start_addr;
+ uint8_t crtc_cursor_loc_low;
+ uint8_t crtc_cursor_loc_high;
+ uint16_t crtc_cursor_loc;
+ uint8_t crtc_vert_retrace_start;
+ uint8_t crtc_vert_retrace_end;
+ uint8_t crtc_vert_disp_end;
+ uint8_t crtc_offset;
+ uint8_t crtc_underline_loc;
+ uint8_t crtc_start_vert_blank;
+ uint8_t crtc_end_vert_blank;
+ uint8_t crtc_line_compare;
+ } vga_crtc;
+
+ /*
+ * Graphics Controller
+ */
+ struct {
+ int gc_index;
+ uint8_t gc_set_reset;
+ uint8_t gc_enb_set_reset;
+ uint8_t gc_color_compare;
+ uint8_t gc_rotate;
+ uint8_t gc_op;
+ uint8_t gc_read_map_sel;
+ uint8_t gc_mode;
+ bool gc_mode_c4; /* chain 4 */
+ bool gc_mode_oe; /* odd/even */
+ uint8_t gc_mode_rm; /* read mode */
+ uint8_t gc_mode_wm; /* write mode */
+ uint8_t gc_misc;
+ uint8_t gc_misc_gm; /* graphics mode */
+ uint8_t gc_misc_mm; /* memory map */
+ uint8_t gc_color_dont_care;
+ uint8_t gc_bit_mask;
+ uint8_t gc_latch0;
+ uint8_t gc_latch1;
+ uint8_t gc_latch2;
+ uint8_t gc_latch3;
+ } vga_gc;
+
+ /*
+ * Attribute Controller
+ */
+ struct {
+ int atc_flipflop;
+ int atc_index;
+ uint8_t atc_palette[16];
+ uint8_t atc_mode;
+ uint8_t atc_overscan_color;
+ uint8_t atc_color_plane_enb;
+ uint8_t atc_horiz_pixel_panning;
+ uint8_t atc_color_select;
+ uint8_t atc_color_select_45;
+ uint8_t atc_color_select_67;
+ } vga_atc;
+
+ /*
+ * DAC
+ */
+ struct {
+ uint8_t dac_state;
+ uint8_t dac_rd_index;
+ uint8_t dac_rd_subindex;
+ uint8_t dac_wr_index;
+ uint8_t dac_wr_subindex;
+ uint8_t dac_palette[3 * 256];
+ uint32_t dac_palette_rgb[256];
+ } vga_dac;
+};
+
+/*
+ * Report whether the adapter is in a state where nothing should be
+ * rendered.  That is the case when any of the following holds:
+ *  - SEQ_CM_SO is set in the sequencer clock mode register,
+ *  - SEQ_RESET_ASYNC or SEQ_RESET_SYNC is clear in the sequencer reset
+ *    register,
+ *  - CRTC_MC_TE is clear in the CRTC mode control register.
+ */
+static bool
+vga_in_reset(struct vga_softc *sc)
+{
+	if ((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0)
+		return (true);
+	if ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0)
+		return (true);
+	if ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0)
+		return (true);
+
+	return ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0);
+}
+
+/*
+ * Recompute the display size from the CRTC/sequencer registers and resize
+ * the console when it differs from the current image size.  Does nothing
+ * while the adapter is in reset.
+ */
+static void
+vga_check_size(struct bhyvegc *gc, struct vga_softc *sc)
+{
+	int prev_width, prev_height;
+	int vde, vde8, vde9;
+
+	if (vga_in_reset(sc))
+		return;
+
+	prev_width = sc->gc_image->width;
+	prev_height = sc->gc_image->height;
+
+	/*
+	 * Horizontal Display End: For text modes this is the number
+	 * of characters.  For graphics modes this is the number of
+	 * pixels per scanlines divided by the number of pixels per
+	 * character clock.
+	 */
+	sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) *
+	    sc->vga_seq.seq_cm_dots;
+
+	/* Bits 8 and 9 of Vertical Display End live in the overflow reg. */
+	vde8 = (sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >>
+	    CRTC_OF_VDE8_SHIFT;
+	vde9 = (sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >>
+	    CRTC_OF_VDE9_SHIFT;
+	vde = sc->vga_crtc.crtc_vert_disp_end | (vde8 << 8) | (vde9 << 9);
+	sc->gc_height = vde + 1;
+
+	if (prev_width == sc->gc_width && prev_height == sc->gc_height)
+		return;
+
+	bhyvegc_resize(gc, sc->gc_width, sc->gc_height);
+}
+
+/*
+ * Return the RGB value of pixel (x, y) in planar graphics mode.
+ *
+ * One bit is taken from each of the four 64KB planes of vga_ram to form a
+ * 4-bit value, which is masked with the colour plane enable register,
+ * translated through the attribute controller palette and colour select
+ * bits, and finally looked up in the DAC palette.
+ */
+static uint32_t
+vga_get_pixel(struct vga_softc *sc, int x, int y)
+{
+	int offset, bit, plane;
+	uint8_t data, idx;
+
+	offset = (y * sc->gc_width / 8) + (x / 8);
+	bit = 7 - (x % 8);
+
+	/* Gather bit `bit` of the addressed byte from each plane. */
+	data = 0;
+	for (plane = 0; plane < 4; plane++) {
+		data |= ((sc->vga_ram[offset + plane * 64*KB] >> bit) & 0x1)
+		    << plane;
+	}
+
+	data &= sc->vga_atc.atc_color_plane_enb;
+
+	idx = sc->vga_atc.atc_palette[data];
+	if (sc->vga_atc.atc_mode & ATC_MC_IPS) {
+		/* Palette bits 4-5 are replaced by the colour select bits. */
+		idx &= 0x0f;
+		idx |= sc->vga_atc.atc_color_select_45;
+	}
+	idx |= sc->vga_atc.atc_color_select_67;
+
+	return (sc->vga_dac.dac_palette_rgb[idx]);
+}
+
+/*
+ * Fill the console image with the current graphics-mode contents, one
+ * pixel at a time, row by row.
+ */
+static void
+vga_render_graphics(struct vga_softc *sc)
+{
+	int row, col;
+
+	for (row = 0; row < sc->gc_height; row++) {
+		for (col = 0; col < sc->gc_width; col++) {
+			sc->gc_image->data[row * sc->gc_width + col] =
+			    vga_get_pixel(sc, col, row);
+		}
+	}
+}
+
+/*
+ * Return the RGB value of pixel (x, y) in text mode.
+ *
+ * The character and attribute bytes for the cell containing the pixel are
+ * read from the first two 64KB planes of vga_ram, and the glyph row from
+ * the third.  Cells are treated as 16 scanlines high and seq_cm_dots
+ * pixels wide; columns past the eighth reuse the glyph's last bit.
+ */
+static uint32_t
+vga_get_text_pixel(struct vga_softc *sc, int x, int y)
+{
+	int dots, offset, bit, font_offset;
+	int scanline, column;
+	uint8_t ch, attr, font;
+	uint8_t idx;
+
+	dots = sc->vga_seq.seq_cm_dots;
+	scanline = y % 16;
+
+	offset = 2 * sc->vga_crtc.crtc_start_addr;
+	offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2;
+
+	column = x % dots;
+	if (column > 7)
+		column = 7;
+	bit = 7 - column;
+
+	ch = sc->vga_ram[offset + 0 * 64*KB];
+	attr = sc->vga_ram[offset + 1 * 64*KB];
+
+	/* Inside the cursor box the foreground colour is drawn solid. */
+	if (sc->vga_crtc.crtc_cursor_on &&
+	    (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) &&
+	    (scanline >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) &&
+	    (scanline <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) {
+		idx = sc->vga_atc.atc_palette[attr & 0xf];
+		return (sc->vga_dac.dac_palette_rgb[idx]);
+	}
+
+	font_offset = (ch << 5) + scanline;
+	if ((sc->vga_seq.seq_mm & SEQ_MM_EM) &&
+	    sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) {
+		/*
+		 * Two character maps are in use: attribute bit 3 picks
+		 * the map and no longer takes part in the colour.
+		 */
+		if (attr & 0x8)
+			font_offset += sc->vga_seq.seq_cmap_pri_off;
+		else
+			font_offset += sc->vga_seq.seq_cmap_sec_off;
+		attr &= ~0x8;
+	}
+
+	font = sc->vga_ram[font_offset + 2 * 64*KB];
+
+	/* Glyph bit set: foreground (low nibble); clear: background. */
+	if (font & (1 << bit))
+		idx = sc->vga_atc.atc_palette[attr & 0xf];
+	else
+		idx = sc->vga_atc.atc_palette[attr >> 4];
+
+	return (sc->vga_dac.dac_palette_rgb[idx]);
+}
+
+/*
+ * Fill the console image with the current text-mode frame by evaluating
+ * vga_get_text_pixel() for every coordinate of the gc_width x gc_height
+ * display.
+ */
+static void
+vga_render_text(struct vga_softc *sc)
+{
+	int row, col;
+
+	for (row = 0; row < sc->gc_height; row++) {
+		/* Index of the first pixel of this scanline in the image. */
+		const int line_start = row * sc->gc_width;
+
+		for (col = 0; col < sc->gc_width; col++) {
+			sc->gc_image->data[line_start + col] =
+			    vga_get_text_pixel(sc, col, row);
+		}
+	}
+}
+
+/*
+ * Render callback: redraw the whole console image from the VGA state.
+ *
+ * 'arg' is the VGA softc.  The display size is re-checked first.  While the
+ * device is in reset the image is cleared to zero; otherwise the frame is
+ * drawn in graphics mode when both the graphics controller and the attribute
+ * controller select it, and in text mode in every other case.
+ */
+void
+vga_render(struct bhyvegc *gc, void *arg)
+{
+	struct vga_softc *sc = arg;
+	int graphics;
+
+	vga_check_size(gc, sc);
+
+	if (vga_in_reset(sc)) {
+		memset(sc->gc_image->data, 0,
+		    sc->gc_image->width * sc->gc_image->height *
+		    sizeof (uint32_t));
+		return;
+	}
+
+	graphics = sc->vga_gc.gc_misc_gm &&
+	    (sc->vga_atc.atc_mode & ATC_MC_GA);
+	if (!graphics) {
+		vga_render_text(sc);
+		return;
+	}
+	vga_render_graphics(sc);
+}
+
+/*
+ * Service a one-byte guest read from the VGA memory window.
+ *
+ * 'addr' is translated into an offset according to the memory map select
+ * field (gc_misc_mm).  All four latches are loaded from that offset, then the
+ * byte of the plane chosen by the Read Map Select register is returned (read
+ * mode 0).  In odd/even mode the low address bit is folded into the plane
+ * number.  Read mode 1 is not implemented and asserts.
+ *
+ * vga_ram holds four 64KB planes (256KB, see vga_init()).  The 128KB memory
+ * map and an unmasked Read Map Select value could both index past the end of
+ * that buffer, so the offset is wrapped to one plane and the plane number is
+ * limited to 0-3 before any access.
+ */
+static uint64_t
+vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1)
+{
+	struct vga_softc *sc = arg1;
+	uint8_t map_sel;
+	int offset;
+
+	offset = addr;
+	switch (sc->vga_gc.gc_misc_mm) {
+	case 0x0:
+		/*
+		 * extended mode: base 0xa0000 size 128k
+		 */
+		offset -= 0xa0000;
+		offset &= (128 * KB - 1);
+		break;
+	case 0x1:
+		/*
+		 * EGA/VGA mode: base 0xa0000 size 64k
+		 */
+		offset -= 0xa0000;
+		offset &= (64 * KB - 1);
+		break;
+	case 0x2:
+		/*
+		 * monochrome text mode: base 0xb0000 size 32kb
+		 * (unsupported; only continues past this point if the
+		 * assertion is compiled out)
+		 */
+#ifdef __FreeBSD__
+		assert(0);
+#else
+		abort();
+#endif
+	case 0x3:
+		/*
+		 * color text mode and CGA: base 0xb8000 size 32kb
+		 */
+		offset -= 0xb8000;
+		offset &= (32 * KB - 1);
+		break;
+	}
+
+	/* Keep plane-relative accesses inside a single 64KB plane. */
+	offset &= (64 * KB - 1);
+
+	/* Fill latches. */
+	sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB];
+	sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB];
+	sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB];
+	sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB];
+
+	if (sc->vga_gc.gc_mode_rm) {
+		/* read mode 1 */
+		assert(0);
+	}
+
+	/* Only the low two bits of the register select a plane. */
+	map_sel = sc->vga_gc.gc_read_map_sel & 0x3;
+	if (sc->vga_gc.gc_mode_oe) {
+		map_sel |= (offset & 1);
+		offset &= ~1;
+	}
+
+	/* read mode 0: return the byte from the selected plane. */
+	offset += map_sel * 64*KB;
+
+	return (sc->vga_ram[offset]);
+}
+
+/*
+ * Service a one-byte guest write to the VGA memory window.
+ *
+ * 'addr' is translated into an offset according to the memory map select
+ * field (gc_misc_mm).  The new contents of each of the four planes are
+ * computed from the latches, the written value, the set/reset registers and
+ * the bit mask according to the write mode (gc_mode_wm) and the logical
+ * operation, and are then stored to the planes enabled in the sequencer map
+ * mask.  In odd/even mode an even address targets planes 0 and 2 and an odd
+ * address targets planes 1 and 3.
+ *
+ * The rotate count (bits 0-2) and the logical operation (bits 3-4) are both
+ * decoded here from the raw Data Rotate register value held in gc_rotate.
+ * Using the whole register as a shift count produced out-of-range shifts, and
+ * the pre-shifted gc_op value never matched the 0x08/0x10/0x18 case labels,
+ * so AND, OR and XOR were silently skipped.
+ *
+ * vga_ram holds four 64KB planes (256KB, see vga_init()); the offset is
+ * wrapped to one plane so the 128KB memory map cannot write past the buffer.
+ */
+static void
+vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1)
+{
+	struct vga_softc *sc = arg1;
+	uint8_t c[4];
+	uint8_t set_reset;
+	uint8_t enb_set_reset;
+	uint8_t mask, src, rotate, op, planes;
+	int offset, plane;
+
+	offset = addr;
+	switch (sc->vga_gc.gc_misc_mm) {
+	case 0x0:
+		/*
+		 * extended mode: base 0xa0000 size 128kb
+		 */
+		offset -= 0xa0000;
+		offset &= (128 * KB - 1);
+		break;
+	case 0x1:
+		/*
+		 * EGA/VGA mode: base 0xa0000 size 64kb
+		 */
+		offset -= 0xa0000;
+		offset &= (64 * KB - 1);
+		break;
+	case 0x2:
+		/*
+		 * monochrome text mode: base 0xb0000 size 32kb
+		 * (unsupported; only continues past this point if the
+		 * assertion is compiled out)
+		 */
+#ifdef __FreeBSD__
+		assert(0);
+#else
+		abort();
+#endif
+	case 0x3:
+		/*
+		 * color text mode and CGA: base 0xb8000 size 32kb
+		 */
+		offset -= 0xb8000;
+		offset &= (32 * KB - 1);
+		break;
+	}
+
+	/* Keep plane-relative accesses inside a single 64KB plane. */
+	offset &= (64 * KB - 1);
+
+	set_reset = sc->vga_gc.gc_set_reset;
+	enb_set_reset = sc->vga_gc.gc_enb_set_reset;
+
+	/* Data Rotate register: bits 0-2 rotate count, bits 3-4 operation. */
+	rotate = sc->vga_gc.gc_rotate & 0x07;
+	op = sc->vga_gc.gc_rotate & 0x18;
+
+	c[0] = sc->vga_gc.gc_latch0;
+	c[1] = sc->vga_gc.gc_latch1;
+	c[2] = sc->vga_gc.gc_latch2;
+	c[3] = sc->vga_gc.gc_latch3;
+
+	switch (sc->vga_gc.gc_mode_wm) {
+	case 0:
+		/* write mode 0 */
+		mask = sc->vga_gc.gc_bit_mask;
+
+		val = (val >> rotate) | (val << (8 - rotate));
+
+		for (plane = 0; plane < 4; plane++) {
+			const uint8_t bit = 1 << plane;
+			const int sr = (set_reset & bit) != 0;
+			const int esr = (enb_set_reset & bit) != 0;
+			uint8_t m;
+
+			switch (op) {
+			case 0x00:		/* replace */
+				m = sr ? mask : 0x00;
+				c[plane] = (esr ? (c[plane] & ~mask) :
+				    (val & mask)) | m;
+				break;
+			case 0x08:		/* AND */
+				m = sr ? 0xff : ~mask;
+				c[plane] = (esr ? c[plane] : val) & m;
+				break;
+			case 0x10:		/* OR */
+				m = sr ? mask : 0x00;
+				c[plane] = (esr ? c[plane] : val) | m;
+				break;
+			case 0x18:		/* XOR */
+				m = sr ? mask : 0x00;
+				c[plane] = (esr ? c[plane] : val) ^ m;
+				break;
+			}
+		}
+		break;
+	case 1:
+		/* write mode 1: the latches are written back unchanged */
+		break;
+	case 2:
+	case 3:
+		if (sc->vga_gc.gc_mode_wm == 2) {
+			/* write mode 2: val bit N supplies plane N */
+			mask = sc->vga_gc.gc_bit_mask;
+			src = val;
+		} else {
+			/*
+			 * write mode 3: set/reset bit N supplies plane N and
+			 * the written value narrows the bit mask.
+			 * NOTE(review): the mask uses the unrotated value, as
+			 * the previous code did; confirm against the VGA spec.
+			 */
+			mask = sc->vga_gc.gc_bit_mask & val;
+			src = set_reset;
+		}
+
+		for (plane = 0; plane < 4; plane++) {
+			const uint8_t fill = (src & (1 << plane)) ? 0xff : 0x00;
+
+			switch (op) {
+			case 0x00:		/* replace */
+				c[plane] = (c[plane] & ~mask) | (fill & mask);
+				break;
+			case 0x08:		/* AND */
+				c[plane] &= fill | ~mask;
+				break;
+			case 0x10:		/* OR */
+				c[plane] |= fill & mask;
+				break;
+			case 0x18:		/* XOR */
+				c[plane] ^= fill & mask;
+				break;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+
+	planes = sc->vga_seq.seq_map_mask;
+	if (sc->vga_gc.gc_mode_oe) {
+		/* odd address: planes 1 and 3; even address: planes 0 and 2 */
+		planes &= (offset & 1) ? 0x0a : 0x05;
+		offset &= ~1;
+	}
+
+	for (plane = 0; plane < 4; plane++) {
+		if (planes & (1 << plane))
+			sc->vga_ram[offset + plane * 64*KB] = c[plane];
+	}
+}
+
+/*
+ * Memory-region callback for the VGA window.
+ *
+ * Accesses of 1, 2, 4 or 8 bytes are split into single-byte operations in
+ * ascending address order, least significant byte first; any other size is
+ * ignored.  For reads the bytes are assembled into *val.  Always returns 0.
+ */
+static int
+vga_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+    int size, uint64_t *val, void *arg1, long arg2)
+{
+	uint64_t data;
+	int i;
+
+	if (size != 1 && size != 2 && size != 4 && size != 8)
+		return (0);
+
+	if (dir == MEM_F_WRITE) {
+		for (i = 0; i < size; i++) {
+			vga_mem_wr_handler(ctx, addr + i, *val >> (8 * i),
+			    arg1);
+		}
+		return (0);
+	}
+
+	data = 0;
+	for (i = 0; i < size; i++)
+		data |= vga_mem_rd_handler(ctx, addr + i, arg1) << (8 * i);
+	*val = data;
+
+	return (0);
+}
+/*
+ * Read one byte from VGA I/O port 'port' into *val.
+ *
+ * Index ports return the currently selected register index; data ports return
+ * the register selected by that index.  An index with no case below hits
+ * assert(0).  Reading the DAC data port steps through the three bytes of the
+ * current palette entry and then advances the read index.  Reading input
+ * status 1 resets the attribute controller flip-flop.
+ *
+ * The 'ctx', 'in' and 'bytes' arguments are not used here.
+ * Returns 0 on success, or -1 if the port is not handled.
+ */
+static int
+vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes,
+ uint8_t *val, void *arg)
+{
+ struct vga_softc *sc = arg;
+
+ switch (port) {
+ case CRTC_IDX_MONO_PORT:
+ case CRTC_IDX_COLOR_PORT:
+ *val = sc->vga_crtc.crtc_index;
+ break;
+ case CRTC_DATA_MONO_PORT:
+ case CRTC_DATA_COLOR_PORT:
+ switch (sc->vga_crtc.crtc_index) {
+ case CRTC_HORIZ_TOTAL:
+ *val = sc->vga_crtc.crtc_horiz_total;
+ break;
+ case CRTC_HORIZ_DISP_END:
+ *val = sc->vga_crtc.crtc_horiz_disp_end;
+ break;
+ case CRTC_START_HORIZ_BLANK:
+ *val = sc->vga_crtc.crtc_start_horiz_blank;
+ break;
+ case CRTC_END_HORIZ_BLANK:
+ *val = sc->vga_crtc.crtc_end_horiz_blank;
+ break;
+ case CRTC_START_HORIZ_RETRACE:
+ *val = sc->vga_crtc.crtc_start_horiz_retrace;
+ break;
+ case CRTC_END_HORIZ_RETRACE:
+ *val = sc->vga_crtc.crtc_end_horiz_retrace;
+ break;
+ case CRTC_VERT_TOTAL:
+ *val = sc->vga_crtc.crtc_vert_total;
+ break;
+ case CRTC_OVERFLOW:
+ *val = sc->vga_crtc.crtc_overflow;
+ break;
+ case CRTC_PRESET_ROW_SCAN:
+ *val = sc->vga_crtc.crtc_present_row_scan;
+ break;
+ case CRTC_MAX_SCAN_LINE:
+ *val = sc->vga_crtc.crtc_max_scan_line;
+ break;
+ case CRTC_CURSOR_START:
+ *val = sc->vga_crtc.crtc_cursor_start;
+ break;
+ case CRTC_CURSOR_END:
+ *val = sc->vga_crtc.crtc_cursor_end;
+ break;
+ case CRTC_START_ADDR_HIGH:
+ *val = sc->vga_crtc.crtc_start_addr_high;
+ break;
+ case CRTC_START_ADDR_LOW:
+ *val = sc->vga_crtc.crtc_start_addr_low;
+ break;
+ case CRTC_CURSOR_LOC_HIGH:
+ *val = sc->vga_crtc.crtc_cursor_loc_high;
+ break;
+ case CRTC_CURSOR_LOC_LOW:
+ *val = sc->vga_crtc.crtc_cursor_loc_low;
+ break;
+ case CRTC_VERT_RETRACE_START:
+ *val = sc->vga_crtc.crtc_vert_retrace_start;
+ break;
+ case CRTC_VERT_RETRACE_END:
+ *val = sc->vga_crtc.crtc_vert_retrace_end;
+ break;
+ case CRTC_VERT_DISP_END:
+ *val = sc->vga_crtc.crtc_vert_disp_end;
+ break;
+ case CRTC_OFFSET:
+ *val = sc->vga_crtc.crtc_offset;
+ break;
+ case CRTC_UNDERLINE_LOC:
+ *val = sc->vga_crtc.crtc_underline_loc;
+ break;
+ case CRTC_START_VERT_BLANK:
+ *val = sc->vga_crtc.crtc_start_vert_blank;
+ break;
+ case CRTC_END_VERT_BLANK:
+ *val = sc->vga_crtc.crtc_end_vert_blank;
+ break;
+ case CRTC_MODE_CONTROL:
+ *val = sc->vga_crtc.crtc_mode_ctrl;
+ break;
+ case CRTC_LINE_COMPARE:
+ *val = sc->vga_crtc.crtc_line_compare;
+ break;
+ default:
+ //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case ATC_IDX_PORT:
+ *val = sc->vga_atc.atc_index;
+ break;
+ case ATC_DATA_PORT:
+ switch (sc->vga_atc.atc_index) {
+ case ATC_PALETTE0 ... ATC_PALETTE15:
+ *val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index];
+ break;
+ case ATC_MODE_CONTROL:
+ *val = sc->vga_atc.atc_mode;
+ break;
+ case ATC_OVERSCAN_COLOR:
+ *val = sc->vga_atc.atc_overscan_color;
+ break;
+ case ATC_COLOR_PLANE_ENABLE:
+ *val = sc->vga_atc.atc_color_plane_enb;
+ break;
+ case ATC_HORIZ_PIXEL_PANNING:
+ *val = sc->vga_atc.atc_horiz_pixel_panning;
+ break;
+ case ATC_COLOR_SELECT:
+ *val = sc->vga_atc.atc_color_select;
+ break;
+ default:
+ //printf("XXX VGA ATC inb 0x%04x at index %d\n", port , sc->vga_atc.atc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case SEQ_IDX_PORT:
+ *val = sc->vga_seq.seq_index;
+ break;
+ case SEQ_DATA_PORT:
+ switch (sc->vga_seq.seq_index) {
+ case SEQ_RESET:
+ *val = sc->vga_seq.seq_reset;
+ break;
+ case SEQ_CLOCKING_MODE:
+ *val = sc->vga_seq.seq_clock_mode;
+ break;
+ case SEQ_MAP_MASK:
+ *val = sc->vga_seq.seq_map_mask;
+ break;
+ case SEQ_CHAR_MAP_SELECT:
+ *val = sc->vga_seq.seq_cmap_sel;
+ break;
+ case SEQ_MEMORY_MODE:
+ *val = sc->vga_seq.seq_mm;
+ break;
+ default:
+ //printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index);
+ assert(0);
+ break;
+ }
+ break;
+ case DAC_DATA_PORT:
+ *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index +
+ sc->vga_dac.dac_rd_subindex];
+ sc->vga_dac.dac_rd_subindex++;
+ if (sc->vga_dac.dac_rd_subindex == 3) {
+ sc->vga_dac.dac_rd_index++;
+ sc->vga_dac.dac_rd_subindex = 0;
+ }
+ break;
+ case GC_IDX_PORT:
+ *val = sc->vga_gc.gc_index;
+ break;
+ case GC_DATA_PORT:
+ switch (sc->vga_gc.gc_index) {
+ case GC_SET_RESET:
+ *val = sc->vga_gc.gc_set_reset;
+ break;
+ case GC_ENABLE_SET_RESET:
+ *val = sc->vga_gc.gc_enb_set_reset;
+ break;
+ case GC_COLOR_COMPARE:
+ *val = sc->vga_gc.gc_color_compare;
+ break;
+ case GC_DATA_ROTATE:
+ *val = sc->vga_gc.gc_rotate;
+ break;
+ case GC_READ_MAP_SELECT:
+ *val = sc->vga_gc.gc_read_map_sel;
+ break;
+ case GC_MODE:
+ *val = sc->vga_gc.gc_mode;
+ break;
+ case GC_MISCELLANEOUS:
+ *val = sc->vga_gc.gc_misc;
+ break;
+ case GC_COLOR_DONT_CARE:
+ *val = sc->vga_gc.gc_color_dont_care;
+ break;
+ case GC_BIT_MASK:
+ *val = sc->vga_gc.gc_bit_mask;
+ break;
+ default:
+ //printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case GEN_MISC_OUTPUT_PORT:
+ *val = sc->vga_misc;
+ break;
+ case GEN_INPUT_STS0_PORT:
+ assert(0);
+ break;
+ case GEN_INPUT_STS1_MONO_PORT:
+ case GEN_INPUT_STS1_COLOR_PORT:
+ sc->vga_atc.atc_flipflop = 0;
+#ifdef __FreeBSD__
+ sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE;
+ //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE);
+#else
+ /*
+ * During the bhyve bring-up process, a guest image was failing
+ * to successfully boot. It appeared to be spinning, waiting
+ * for this value to be toggled. Until it can be ruled out
+ * that this is unnecessary (and documentation seems to
+ * indicate that it should be present), the toggle should
+ * remain.
+ */
+ sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE);
+#endif
+ *val = sc->vga_sts1;
+ break;
+ case GEN_FEATURE_CTRL_PORT:
+ // OpenBSD calls this with bytes = 1
+ //assert(0);
+ *val = 0;
+ break;
+ case 0x3c3:
+ *val = 0;
+ break;
+ default:
+ printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port);
+ //assert(0);
+ return (-1);
+ }
+
+ return (0);
+}
+/*
+ * Write the byte 'val' to VGA I/O port 'port'.
+ *
+ * Index ports latch the register index; data ports store val in the register
+ * selected by that index and refresh any cached fields derived from it (start
+ * address, cursor location, character map offsets, graphics mode bits, ...).
+ * The attribute controller takes both index and data through ATC_IDX_PORT,
+ * alternating on atc_flipflop.  Every third write to the DAC data port
+ * completes a palette entry and recomputes its packed dac_palette_rgb value.
+ * An index with no case below hits assert(0).
+ *
+ * The 'ctx', 'in' and 'bytes' arguments are not used here.
+ * Returns 0 on success, or -1 if the port is not handled.
+ */
+static int
+vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes,
+ uint8_t val, void *arg)
+{
+ struct vga_softc *sc = arg;
+
+ switch (port) {
+ case CRTC_IDX_MONO_PORT:
+ case CRTC_IDX_COLOR_PORT:
+ sc->vga_crtc.crtc_index = val;
+ break;
+ case CRTC_DATA_MONO_PORT:
+ case CRTC_DATA_COLOR_PORT:
+ switch (sc->vga_crtc.crtc_index) {
+ case CRTC_HORIZ_TOTAL:
+ sc->vga_crtc.crtc_horiz_total = val;
+ break;
+ case CRTC_HORIZ_DISP_END:
+ sc->vga_crtc.crtc_horiz_disp_end = val;
+ break;
+ case CRTC_START_HORIZ_BLANK:
+ sc->vga_crtc.crtc_start_horiz_blank = val;
+ break;
+ case CRTC_END_HORIZ_BLANK:
+ sc->vga_crtc.crtc_end_horiz_blank = val;
+ break;
+ case CRTC_START_HORIZ_RETRACE:
+ sc->vga_crtc.crtc_start_horiz_retrace = val;
+ break;
+ case CRTC_END_HORIZ_RETRACE:
+ sc->vga_crtc.crtc_end_horiz_retrace = val;
+ break;
+ case CRTC_VERT_TOTAL:
+ sc->vga_crtc.crtc_vert_total = val;
+ break;
+ case CRTC_OVERFLOW:
+ sc->vga_crtc.crtc_overflow = val;
+ break;
+ case CRTC_PRESET_ROW_SCAN:
+ sc->vga_crtc.crtc_present_row_scan = val;
+ break;
+ case CRTC_MAX_SCAN_LINE:
+ sc->vga_crtc.crtc_max_scan_line = val;
+ break;
+ case CRTC_CURSOR_START:
+ sc->vga_crtc.crtc_cursor_start = val;
+ sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0;
+ break;
+ case CRTC_CURSOR_END:
+ sc->vga_crtc.crtc_cursor_end = val;
+ break;
+ case CRTC_START_ADDR_HIGH:
+ sc->vga_crtc.crtc_start_addr_high = val;
+ sc->vga_crtc.crtc_start_addr &= 0x00ff;
+ sc->vga_crtc.crtc_start_addr |= (val << 8);
+ break;
+ case CRTC_START_ADDR_LOW:
+ sc->vga_crtc.crtc_start_addr_low = val;
+ sc->vga_crtc.crtc_start_addr &= 0xff00;
+ sc->vga_crtc.crtc_start_addr |= (val & 0xff);
+ break;
+ case CRTC_CURSOR_LOC_HIGH:
+ sc->vga_crtc.crtc_cursor_loc_high = val;
+ sc->vga_crtc.crtc_cursor_loc &= 0x00ff;
+ sc->vga_crtc.crtc_cursor_loc |= (val << 8);
+ break;
+ case CRTC_CURSOR_LOC_LOW:
+ sc->vga_crtc.crtc_cursor_loc_low = val;
+ sc->vga_crtc.crtc_cursor_loc &= 0xff00;
+ sc->vga_crtc.crtc_cursor_loc |= (val & 0xff);
+ break;
+ case CRTC_VERT_RETRACE_START:
+ sc->vga_crtc.crtc_vert_retrace_start = val;
+ break;
+ case CRTC_VERT_RETRACE_END:
+ sc->vga_crtc.crtc_vert_retrace_end = val;
+ break;
+ case CRTC_VERT_DISP_END:
+ sc->vga_crtc.crtc_vert_disp_end = val;
+ break;
+ case CRTC_OFFSET:
+ sc->vga_crtc.crtc_offset = val;
+ break;
+ case CRTC_UNDERLINE_LOC:
+ sc->vga_crtc.crtc_underline_loc = val;
+ break;
+ case CRTC_START_VERT_BLANK:
+ sc->vga_crtc.crtc_start_vert_blank = val;
+ break;
+ case CRTC_END_VERT_BLANK:
+ sc->vga_crtc.crtc_end_vert_blank = val;
+ break;
+ case CRTC_MODE_CONTROL:
+ sc->vga_crtc.crtc_mode_ctrl = val;
+ break;
+ case CRTC_LINE_COMPARE:
+ sc->vga_crtc.crtc_line_compare = val;
+ break;
+ default:
+ //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case ATC_IDX_PORT:
+ if (sc->vga_atc.atc_flipflop == 0) {
+ if (sc->vga_atc.atc_index & 0x20)
+ assert(0);
+ sc->vga_atc.atc_index = val & ATC_IDX_MASK;
+ } else {
+ switch (sc->vga_atc.atc_index) {
+ case ATC_PALETTE0 ... ATC_PALETTE15:
+ sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f;
+ break;
+ case ATC_MODE_CONTROL:
+ sc->vga_atc.atc_mode = val;
+ break;
+ case ATC_OVERSCAN_COLOR:
+ sc->vga_atc.atc_overscan_color = val;
+ break;
+ case ATC_COLOR_PLANE_ENABLE:
+ sc->vga_atc.atc_color_plane_enb = val;
+ break;
+ case ATC_HORIZ_PIXEL_PANNING:
+ sc->vga_atc.atc_horiz_pixel_panning = val;
+ break;
+ case ATC_COLOR_SELECT:
+ sc->vga_atc.atc_color_select = val;
+ sc->vga_atc.atc_color_select_45 =
+ (val & ATC_CS_C45) << 4;
+ sc->vga_atc.atc_color_select_67 =
+ ((val & ATC_CS_C67) >> 2) << 6;
+ break;
+ default:
+ //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index);
+ assert(0);
+ break;
+ }
+ }
+ sc->vga_atc.atc_flipflop ^= 1;
+ break;
+ case ATC_DATA_PORT:
+ break;
+ case SEQ_IDX_PORT:
+ sc->vga_seq.seq_index = val & 0x1f;
+ break;
+ case SEQ_DATA_PORT:
+ switch (sc->vga_seq.seq_index) {
+ case SEQ_RESET:
+ sc->vga_seq.seq_reset = val;
+ break;
+ case SEQ_CLOCKING_MODE:
+ sc->vga_seq.seq_clock_mode = val;
+ sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 8 : 9;
+ break;
+ case SEQ_MAP_MASK:
+ sc->vga_seq.seq_map_mask = val;
+ break;
+ case SEQ_CHAR_MAP_SELECT:
+ sc->vga_seq.seq_cmap_sel = val;
+
+ sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB;
+ sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB;
+ break;
+ case SEQ_MEMORY_MODE:
+ sc->vga_seq.seq_mm = val;
+ /* Windows queries Chain4 */
+ //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0);
+ break;
+ default:
+ //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index);
+ assert(0);
+ break;
+ }
+ break;
+ case DAC_MASK:
+ break;
+ case DAC_IDX_RD_PORT:
+ sc->vga_dac.dac_rd_index = val;
+ sc->vga_dac.dac_rd_subindex = 0;
+ break;
+ case DAC_IDX_WR_PORT:
+ sc->vga_dac.dac_wr_index = val;
+ sc->vga_dac.dac_wr_subindex = 0;
+ break;
+ case DAC_DATA_PORT:
+ sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index +
+ sc->vga_dac.dac_wr_subindex] = val;
+ sc->vga_dac.dac_wr_subindex++;
+ if (sc->vga_dac.dac_wr_subindex == 3) {
+ sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] =
+ ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) |
+ (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) |
+ (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0));
+
+ sc->vga_dac.dac_wr_index++;
+ sc->vga_dac.dac_wr_subindex = 0;
+ }
+ break;
+ case GC_IDX_PORT:
+ sc->vga_gc.gc_index = val;
+ break;
+ case GC_DATA_PORT:
+ switch (sc->vga_gc.gc_index) {
+ case GC_SET_RESET:
+ sc->vga_gc.gc_set_reset = val;
+ break;
+ case GC_ENABLE_SET_RESET:
+ sc->vga_gc.gc_enb_set_reset = val;
+ break;
+ case GC_COLOR_COMPARE:
+ sc->vga_gc.gc_color_compare = val;
+ break;
+ case GC_DATA_ROTATE:
+ sc->vga_gc.gc_rotate = val;
+ sc->vga_gc.gc_op = (val >> 3) & 0x3;
+ break;
+ case GC_READ_MAP_SELECT:
+ sc->vga_gc.gc_read_map_sel = val;
+ break;
+ case GC_MODE:
+ sc->vga_gc.gc_mode = val;
+ sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0;
+ assert(!sc->vga_gc.gc_mode_c4);
+ sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0;
+ sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1;
+ sc->vga_gc.gc_mode_wm = val & 0x3;
+
+ if (sc->gc_image)
+ sc->gc_image->vgamode = 1;
+ break;
+ case GC_MISCELLANEOUS:
+ sc->vga_gc.gc_misc = val;
+ sc->vga_gc.gc_misc_gm = val & GC_MISC_GM;
+ sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >>
+ GC_MISC_MM_SHIFT;
+ break;
+ case GC_COLOR_DONT_CARE:
+ sc->vga_gc.gc_color_dont_care = val;
+ break;
+ case GC_BIT_MASK:
+ sc->vga_gc.gc_bit_mask = val;
+ break;
+ default:
+ //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case GEN_INPUT_STS0_PORT:
+ /* write to Miscellaneous Output Register */
+ sc->vga_misc = val;
+ break;
+ case GEN_INPUT_STS1_MONO_PORT:
+ case GEN_INPUT_STS1_COLOR_PORT:
+ /* write to Feature Control Register */
+ break;
+// case 0x3c3:
+// break;
+ default:
+ printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val);
+ //assert(0);
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * I/O port callback for the VGA port range.
+ *
+ * One- and two-byte accesses are split into single-byte register accesses on
+ * consecutive ports, low byte first; any other width asserts and returns -1.
+ * For reads the affected low bits of *eax are cleared first and each byte
+ * that was read successfully is merged in.  The value returned is the status
+ * of the last single-byte access performed.
+ */
+static int
+vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	uint8_t val;
+	int error = 0;
+	int i;
+
+	if (bytes != 1 && bytes != 2) {
+		assert(0);
+		return (-1);
+	}
+
+	if (in) {
+		*eax &= (bytes == 1) ? ~0xff : ~0xffff;
+		for (i = 0; i < bytes; i++) {
+			error = vga_port_in_handler(ctx, in, port + i, 1,
+			    &val, arg);
+			if (error == 0)
+				*eax |= (uint32_t)val << (8 * i);
+		}
+	} else {
+		for (i = 0; i < bytes; i++) {
+			val = (*eax >> (8 * i)) & 0xff;
+			error = vga_port_out_handler(ctx, in, port + i, 1,
+			    val, arg);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Expand a 6-bit DAC colour component to 8 bits: the value is shifted left
+ * by two and its lowest bit is replicated into the two new low bits.
+ */
+static uint32_t
+vga_dac_expand6(uint32_t v)
+{
+	return ((v << 2) | ((v & 0x1) << 1) | (v & 0x1));
+}
+
+/*
+ * Create the VGA device model.
+ *
+ * Allocates the device state, registers vga_port_handler for every port in
+ * VGA_IOPORT_START..VGA_IOPORT_END and attaches the console image.  When
+ * io_only is non-zero nothing else is set up.  Otherwise 256KB of zeroed
+ * video memory is allocated, the 128KB window at 640KB is registered with
+ * vga_mem_handler, and the first 16 DAC palette entries are preloaded.
+ *
+ * All memory is obtained before any handler is registered, so a registered
+ * handler never sees a missing vga_ram buffer.
+ *
+ * Returns the device state as an opaque pointer, or NULL if memory could not
+ * be allocated (in which case nothing has been registered).
+ */
+void *
+vga_init(int io_only)
+{
+	static const uint8_t palette[] = {
+		0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a,
+		0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a,
+		0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f,
+		0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f,
+	};
+	struct inout_port iop;
+	struct vga_softc *sc;
+	int port, error, i;
+
+	sc = calloc(1, sizeof (*sc));
+	if (sc == NULL)
+		return (NULL);
+
+	if (!io_only) {
+		sc->vga_ram = calloc(1, 256 * KB);
+		if (sc->vga_ram == NULL) {
+			free(sc);
+			return (NULL);
+		}
+	}
+
+	memset(&iop, 0, sizeof (iop));
+	iop.name = "VGA";
+	for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) {
+		iop.port = port;
+		iop.size = 1;
+		iop.flags = IOPORT_F_INOUT;
+		iop.handler = vga_port_handler;
+		iop.arg = sc;
+
+		error = register_inout(&iop);
+		assert(error == 0);
+	}
+
+	sc->gc_image = console_get_image();
+
+	/* only handle io ports; vga graphics is disabled */
+	if (io_only)
+		return (sc);
+
+	sc->mr.name = "VGA memory";
+	sc->mr.flags = MEM_F_RW;
+	sc->mr.base = 640 * KB;
+	sc->mr.size = 128 * KB;
+	sc->mr.handler = vga_mem_handler;
+	sc->mr.arg1 = sc;
+	error = register_mem_fallback(&sc->mr);
+	assert(error == 0);
+
+	/* Preload the first 16 DAC entries and their packed RGB values. */
+	memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t));
+	for (i = 0; i < 16; i++) {
+		sc->vga_dac.dac_palette_rgb[i] =
+		    (vga_dac_expand6(sc->vga_dac.dac_palette[3*i + 0]) << 16) |
+		    (vga_dac_expand6(sc->vga_dac.dac_palette[3*i + 1]) << 8) |
+		    (vga_dac_expand6(sc->vga_dac.dac_palette[3*i + 2]) << 0);
+	}
+
+	return (sc);
+}
diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h
new file mode 100644
index 0000000000..36c6dc15fa
--- /dev/null
+++ b/usr/src/cmd/bhyve/vga.h
@@ -0,0 +1,162 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VGA_H_
+#define _VGA_H_
+
+#define VGA_IOPORT_START 0x3c0
+#define VGA_IOPORT_END 0x3df
+
+/* General registers */
+#define GEN_INPUT_STS0_PORT 0x3c2
+#define GEN_FEATURE_CTRL_PORT 0x3ca
+#define GEN_MISC_OUTPUT_PORT 0x3cc
+#define GEN_INPUT_STS1_MONO_PORT 0x3ba
+#define GEN_INPUT_STS1_COLOR_PORT 0x3da
+#define GEN_IS1_VR 0x08 /* Vertical retrace */
+#define GEN_IS1_DE 0x01 /* Display enable not */
+
+/* Attribute controller registers. */
+#define ATC_IDX_PORT 0x3c0
+#define ATC_DATA_PORT 0x3c1
+
+#define ATC_IDX_MASK 0x1f
+#define ATC_PALETTE0 0
+#define ATC_PALETTE15 15
+#define ATC_MODE_CONTROL 16
+#define ATC_MC_IPS 0x80 /* Internal palette size */
+#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */
+#define ATC_OVERSCAN_COLOR 17
+#define ATC_COLOR_PLANE_ENABLE 18
+#define ATC_HORIZ_PIXEL_PANNING 19
+#define ATC_COLOR_SELECT 20
+#define ATC_CS_C67 0x0c /* Color select bits 6+7 */
+#define ATC_CS_C45 0x03 /* Color select bits 4+5 */
+
+/* Sequencer registers. */
+#define SEQ_IDX_PORT 0x3c4
+#define SEQ_DATA_PORT 0x3c5
+
+#define SEQ_RESET 0
+#define SEQ_RESET_ASYNC 0x1
+#define SEQ_RESET_SYNC 0x2
+#define SEQ_CLOCKING_MODE 1
+#define SEQ_CM_SO 0x20 /* Screen off */
+#define SEQ_CM_89 0x01 /* 8/9 dot clock */
+#define SEQ_MAP_MASK 2
+#define SEQ_CHAR_MAP_SELECT 3
+#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */
+#define SEQ_CMS_SAH_SHIFT 5
+#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */
+#define SEQ_CMS_SA_SHIFT 2
+#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */
+#define SEQ_CMS_SBH_SHIFT 4
+#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */
+#define SEQ_CMS_SB_SHIFT 0
+#define SEQ_MEMORY_MODE 4
+#define SEQ_MM_C4 0x08 /* Chain 4 */
+#define SEQ_MM_OE 0x04 /* Odd/even */
+#define SEQ_MM_EM 0x02 /* Extended memory */
+
+/* Graphics controller registers. */
+#define GC_IDX_PORT 0x3ce
+#define GC_DATA_PORT 0x3cf
+
+#define GC_SET_RESET 0
+#define GC_ENABLE_SET_RESET 1
+#define GC_COLOR_COMPARE 2
+#define GC_DATA_ROTATE 3
+#define GC_READ_MAP_SELECT 4
+#define GC_MODE 5
+#define GC_MODE_OE 0x10 /* Odd/even */
+#define GC_MODE_C4 0x04 /* Chain 4 */
+
+#define GC_MISCELLANEOUS 6
+#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */
+#define GC_MISC_MM 0x0c /* memory map */
+#define GC_MISC_MM_SHIFT 2
+#define GC_COLOR_DONT_CARE 7
+#define GC_BIT_MASK 8
+
+/* CRT controller registers. */
+#define CRTC_IDX_MONO_PORT 0x3b4
+#define CRTC_DATA_MONO_PORT 0x3b5
+#define CRTC_IDX_COLOR_PORT 0x3d4
+#define CRTC_DATA_COLOR_PORT 0x3d5
+
+#define CRTC_HORIZ_TOTAL 0
+#define CRTC_HORIZ_DISP_END 1
+#define CRTC_START_HORIZ_BLANK 2
+#define CRTC_END_HORIZ_BLANK 3
+#define CRTC_START_HORIZ_RETRACE 4
+#define CRTC_END_HORIZ_RETRACE 5
+#define CRTC_VERT_TOTAL 6
+#define CRTC_OVERFLOW 7
+#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */
+#define CRTC_OF_VRS9_SHIFT 7
+#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */
+#define CRTC_OF_VDE9_SHIFT 6
+#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */
+#define CRTC_OF_VRS8_SHIFT 2
+#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */
+#define CRTC_OF_VDE8_SHIFT 1
+#define CRTC_PRESET_ROW_SCAN 8
+#define CRTC_MAX_SCAN_LINE 9
+#define CRTC_MSL_MSL 0x1f
+#define CRTC_CURSOR_START 10
+#define CRTC_CS_CO 0x20 /* Cursor off */
+#define CRTC_CS_CS 0x1f /* Cursor start */
+#define CRTC_CURSOR_END 11
+#define CRTC_CE_CE 0x1f /* Cursor end */
+#define CRTC_START_ADDR_HIGH 12
+#define CRTC_START_ADDR_LOW 13
+#define CRTC_CURSOR_LOC_HIGH 14
+#define CRTC_CURSOR_LOC_LOW 15
+#define CRTC_VERT_RETRACE_START 16
+#define CRTC_VERT_RETRACE_END 17
+#define CRTC_VRE_MASK 0xf
+#define CRTC_VERT_DISP_END 18
+#define CRTC_OFFSET 19
+#define CRTC_UNDERLINE_LOC 20
+#define CRTC_START_VERT_BLANK 21
+#define CRTC_END_VERT_BLANK 22
+#define CRTC_MODE_CONTROL 23
+#define CRTC_MC_TE 0x80 /* Timing enable */
+#define CRTC_LINE_COMPARE 24
+
+/* DAC registers */
+#define DAC_MASK 0x3c6
+#define DAC_IDX_RD_PORT 0x3c7
+#define DAC_IDX_WR_PORT 0x3c8
+#define DAC_DATA_PORT 0x3c9
+
+void *vga_init(int io_only); /* returns the VGA softc; non-zero io_only sets up I/O ports only */
+
+#endif /* _VGA_H_ */
diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c
new file mode 100644
index 0000000000..d3ff5e3951
--- /dev/null
+++ b/usr/src/cmd/bhyve/virtio.c
@@ -0,0 +1,794 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * Convert a virtio_softc pointer to the owning device's softc.  Today this
+ * is a plain cast because the virtio softc must sit at the front of the
+ * device softc; keeping it a macro lets that constraint be relaxed later.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Link a virtio_softc to its constants (vc), to the PCI device instance
+ * (pi, whose pi_arg is pointed back at vs) and to its vc->vc_nvq queues.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct pci_devinst *pi,
+ struct vqueue_info *queues)
+{
+ int i;
+
+ /* vs and dev_softc addresses must match: DEV_SOFTC() is a plain cast */
+ assert((void *)vs == dev_softc);
+ vs->vs_vc = vc;
+ vs->vs_pi = pi;
+ pi->pi_arg = vs; /* vi_pci_read()/vi_pci_write() recover vs from here */
+
+ vs->vs_queues = queues;
+ for (i = 0; i < vc->vc_nvq; i++) {
+ queues[i].vq_vs = vs; /* back-pointer to the owning softc */
+ queues[i].vq_num = i; /* each queue records its own index */
+ }
+}
+
+/*
+ * Reset device (device-wide). This erases all queues, i.e.,
+ * all the queues become invalid (though we don't wipe out the
+ * internal pointers, we just zero vq_flags, which drops VQ_ALLOC).
+ *
+ * It resets negotiated features to "none" and clears a pending ISR.
+ *
+ * All MSI-X vectors become NO_VECTOR.  Caller holds vs_mtx if it is set.
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+ struct vqueue_info *vq;
+ int i, nvq;
+
+ if (vs->vs_mtx) /* a softc without a mutex skips the ownership check */
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
+ nvq = vs->vs_vc->vc_nvq;
+ for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
+ vq->vq_flags = 0;
+ vq->vq_last_avail = 0;
+ vq->vq_save_used = 0;
+ vq->vq_pfn = 0;
+ vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
+ }
+ vs->vs_negotiated_caps = 0;
+ vs->vs_curq = 0;
+ /* vs->vs_status = 0; -- redundant */
+ if (vs->vs_isr) /* only deassert the line if it was raised */
+ pci_lintr_deassert(vs->vs_pi);
+ vs->vs_isr = 0;
+ vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
+}
+
+/*
+ * Allocate I/O BAR "barnum" (usually 0) for virtio registers + dev config.
+ */
+void
+vi_set_io_bar(struct virtio_softc *vs, int barnum)
+{
+ size_t size;
+
+ /*
+ * ??? should we use CFG0 if MSI-X is disabled?
+ * Existing code did not...
+ */
+ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; /* header incl. MSI-X regs */
+ pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); /* result unused */
+}
+
+/*
+ * Set up interrupts.  With use_msix, flag the softc and add an MSI-X
+ * capability in BAR "barnum" (one vector per queue plus one for config),
+ * returning 1 if that fails; otherwise just clear the MSI-X flag.
+ * A one-vector MSI capability and a legacy interrupt are always added.
+ * Returns 0 on success.
+ */
+int
+vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
+{
+ int nvec;
+
+ if (use_msix) {
+ vs->vs_flags |= VIRTIO_USE_MSIX;
+ VS_LOCK(vs);
+ vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
+ VS_UNLOCK(vs);
+ nvec = vs->vs_vc->vc_nvq + 1; /* one per queue + config vector */
+ if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
+ return (1);
+ } else
+ vs->vs_flags &= ~VIRTIO_USE_MSIX;
+
+ /* Only 1 MSI vector for bhyve; NOTE(review): result is not examined */
+ pci_emul_add_msicap(vs->vs_pi, 1);
+
+ /* Legacy interrupts are mandatory for virtio devices */
+ pci_lintr_request(vs->vs_pi);
+
+ return (0);
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq) from the
+ * guest-supplied page frame number: map the ring and derive the desc,
+ * avail and used pointers.  vs_curq is not range-checked in here.
+ */
+void
+vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
+{
+ struct vqueue_info *vq;
+ uint64_t phys;
+ size_t size;
+ char *base;
+
+ vq = &vs->vs_queues[vs->vs_curq];
+ vq->vq_pfn = pfn;
+ phys = (uint64_t)pfn << VRING_PFN; /* widen first so the shift can't truncate */
+ size = vring_size(vq->vq_qsize);
+ base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* NOTE(review): result not checked */
+
+ /* First page(s) are descriptors... */
+ vq->vq_desc = (struct virtio_desc *)base;
+ base += vq->vq_qsize * sizeof(struct virtio_desc);
+
+ /* ... immediately followed by "avail" ring (entirely uint16_t's) */
+ vq->vq_avail = (struct vring_avail *)base;
+ base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* 2 header words, ring, 1 trailer */
+
+ /* Then it's rounded up to the next VRING_ALIGN boundary... */
+ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
+
+ /* ... and the last page(s) are the used ring. */
+ vq->vq_used = (struct vring_used *)base;
+
+ /* Mark queue as allocated, and start at 0 when we use it. */
+ vq->vq_flags = VQ_ALLOC;
+ vq->vq_last_avail = 0;
+ vq->vq_save_used = 0;
+}
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real" descriptor in
+ * iov[i] (and flags[i] if flags is non-NULL); a no-op once i >= n_iov.
+ */
+static inline void
+_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
+ struct iovec *iov, int n_iov, uint16_t *flags) {
+
+ if (i >= n_iov) /* caller's array is full: keep counting, store nothing */
+ return;
+ iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); /* guest -> host address */
+ iov[i].iov_len = vd->vd_len;
+ if (flags != NULL)
+ flags[i] = vd->vd_flags;
+}
+#define VQ_MAX_DESCRIPTORS 512 /* cap on descriptors walked per chain; see below */
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request. If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request. This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1. Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more. We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the vd_flags and vd_next field of each
+ * descriptor and tells you how many are involved. Since some may
+ * be indirect, this also needs the vmctx (in the pci_devinst
+ * at vs->vs_pi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise, also using the vmctx) into the given iov[]
+ * array (of the given size). If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If you want to verify the WRITE flag on each descriptor, pass a
+ * non-NULL "flags" pointer to an array of "uint16_t" of the same size
+ * as n_iov and we'll copy each vd_flags field after unwinding any
+ * indirects.  *pidx is set to the head descriptor index of the chain.
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1. If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
+ struct iovec *iov, int n_iov, uint16_t *flags)
+{
+ int i;
+ u_int ndesc, n_indir;
+ u_int idx, next;
+ volatile struct virtio_desc *vdir, *vindir, *vp;
+ struct vmctx *ctx;
+ struct virtio_softc *vs;
+ const char *name;
+
+ vs = vq->vq_vs;
+ name = vs->vs_vc->vc_name;
+
+ /*
+ * Note: it's the responsibility of the guest not to
+ * update vq->vq_avail->va_idx until all of the descriptors
+ * the guest has written are valid (including all their
+ * vd_next fields and vd_flags).
+ *
+ * Compute (va_idx - last_avail) in integers mod 2**16. This is
+ * the number of avail-ring entries the guest driver has added
+ * since the last time we updated vq->vq_last_avail.
+ *
+ * We just need to do the subtraction as an unsigned int,
+ * then trim off excess bits.
+ */
+ idx = vq->vq_last_avail;
+ ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
+ if (ndesc == 0)
+ return (0);
+ if (ndesc > vq->vq_qsize) {
+ /* XXX need better way to diagnose issues */
+ fprintf(stderr,
+ "%s: ndesc (%u) out of range, driver confused?\r\n",
+ name, (u_int)ndesc);
+ return (-1);
+ }
+
+ /*
+ * Now count/parse "involved" descriptors starting from
+ * the head of the chain.
+ *
+ * To prevent loops, we could be more complicated and
+ * check whether we're re-visiting a previously visited
+ * index, but we just abort if the count gets excessive.
+ */
+ ctx = vs->vs_pi->pi_vmctx;
+ *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; /* mask assumes qsize is a power of 2 */
+ vq->vq_last_avail++; /* entry is consumed even if validation below fails */
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+ if (next >= vq->vq_qsize) {
+ fprintf(stderr,
+ "%s: descriptor index %u out of range, "
+ "driver confused?\r\n",
+ name, next);
+ return (-1);
+ }
+ vdir = &vq->vq_desc[next]; /* NOTE(review): guest-shared; fields are re-read below and may change */
+ if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+ _vq_record(i, vdir, ctx, iov, n_iov, flags);
+ i++;
+ } else if ((vs->vs_vc->vc_hv_caps &
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ fprintf(stderr,
+ "%s: descriptor has forbidden INDIRECT flag, "
+ "driver confused?\r\n",
+ name);
+ return (-1);
+ } else {
+ n_indir = vdir->vd_len / 16; /* table entries are 16 bytes each */
+ if ((vdir->vd_len & 0xf) || n_indir == 0) {
+ fprintf(stderr,
+ "%s: invalid indir len 0x%x, "
+ "driver confused?\r\n",
+ name, (u_int)vdir->vd_len);
+ return (-1);
+ }
+ vindir = paddr_guest2host(ctx,
+ vdir->vd_addr, vdir->vd_len); /* NOTE(review): result not checked */
+ /*
+ * Indirects start at the 0th, then follow
+ * their own embedded "next"s until those run
+ * out. Each one's indirect flag must be off
+ * (we don't really have to check, could just
+ * ignore errors...).
+ */
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+ fprintf(stderr,
+ "%s: indirect desc has INDIR flag,"
+ " driver confused?\r\n",
+ name);
+ return (-1);
+ }
+ _vq_record(i, vp, ctx, iov, n_iov, flags);
+ if (++i > VQ_MAX_DESCRIPTORS) /* bounds looping indirect tables */
+ goto loopy;
+ if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ next = vp->vd_next;
+ if (next >= n_indir) {
+ fprintf(stderr,
+ "%s: invalid next %u > %u, "
+ "driver confused?\r\n",
+ name, (u_int)next, n_indir);
+ return (-1);
+ }
+ }
+ }
+ if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
+ return (i); /* end of chain: i "real" descriptors seen */
+ }
+loopy:
+ fprintf(stderr,
+ "%s: descriptor loop? count > %d - driver confused?\r\n",
+ name, i);
+ return (-1);
+}
+
+/*
+ * Return the most recently fetched chain to the available queue by
+ * backing vq_last_avail up one slot, so vq_getchain() yields it again.
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_retchain(struct vqueue_info *vq)
+{
+
+ vq->vq_last_avail--;
+}
+
+/*
+ * Return specified request chain to the guest, setting its I/O length
+ * to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ uint16_t uidx, mask;
+ volatile struct vring_used *vuh;
+ volatile struct virtio_used *vue;
+
+ /*
+ * Notes:
+ * - mask is N-1 where N is a power of 2 so computes x % N
+ * - vuh points to the "used" data shared with guest
+ * - vue points to the "used" ring entry we want to update
+ * - idx is the chain head that vq_getchain() stored in *pidx.
+ *
+ * (I apologize for the two fields named vu_idx; the
+ * virtio spec calls the one that vue points to, "id"...)
+ */
+ mask = vq->vq_qsize - 1;
+ vuh = vq->vq_used;
+
+ uidx = vuh->vu_idx;
+ vue = &vuh->vu_ring[uidx++ & mask]; /* fill current slot, then advance the local index */
+ vue->vu_idx = idx;
+ vue->vu_tlen = iolen;
+#ifndef __FreeBSD__
+ /*
+ * Ensure the used descriptor is visible before updating the index.
+ * This is necessary on ISAs with memory ordering less strict than x86.
+ */
+ wmb();
+#endif
+ vuh->vu_idx = uidx; /* publish the entry to the guest */
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one. If driver used all the available
+ * chains, used_all_avail should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt. Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when the caller decided to finish
+ * processing -- it's possible that descriptors became available after
+ * that point. (It's also typically a constant 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+ struct virtio_softc *vs;
+ uint16_t event_idx, new_idx, old_idx;
+ int intr;
+
+ /*
+ * Interrupt generation: if we're using EVENT_IDX,
+ * interrupt if we've crossed the event threshold.
+ * Otherwise interrupt is generated if we added "used" entries,
+ * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+ *
+ * In any case, though, if NOTIFY_ON_EMPTY is set and the
+ * entire avail was processed, we need to interrupt always.
+ */
+ vs = vq->vq_vs;
+ old_idx = vq->vq_save_used; /* used index as of the previous call */
+ vq->vq_save_used = new_idx = vq->vq_used->vu_idx;
+#ifndef __FreeBSD__
+ /*
+ * Use full memory barrier between vu_idx store from preceding
+ * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
+ * va_flags below.
+ */
+ mb();
+#endif
+ if (used_all_avail &&
+ (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+ intr = 1;
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+ event_idx = VQ_USED_EVENT_IDX(vq);
+ /*
+ * This calculation is per docs and the kernel
+ * (see src/sys/dev/virtio/virtio_ring.h).
+ */
+ intr = (uint16_t)(new_idx - event_idx - 1) <
+ (uint16_t)(new_idx - old_idx);
+ } else {
+ intr = new_idx != old_idx &&
+ !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ if (intr)
+ vq_interrupt(vs, vq);
+}
+
+/* Virtio header registers; kept sorted by offset so vi_find_cr() can bisect */
+static struct config_reg {
+ uint16_t cr_offset; /* register offset */
+ uint8_t cr_size; /* size (bytes) */
+ uint8_t cr_ro; /* true => reg is read only */
+ const char *cr_name; /* name of reg */
+} config_regs[] = {
+ { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" },
+ { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" },
+ { VTCFG_R_PFN, 4, 0, "PFN" },
+ { VTCFG_R_QNUM, 2, 1, "QNUM" },
+ { VTCFG_R_QSEL, 2, 0, "QSEL" },
+ { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" },
+ { VTCFG_R_STATUS, 1, 0, "STATUS" },
+ { VTCFG_R_ISR, 1, 0, "ISR" }, /* not r/o, but vi_pci_write() has no case for it */
+ { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" },
+ { VTCFG_R_QVEC, 2, 0, "QVEC" },
+};
+
+static inline struct config_reg *
+vi_find_cr(int offset) { /* bisect config_regs[] by offset; NULL if no match */
+ int hi, lo, mid; /* signed, so "hi = mid - 1" reaches -1 instead of wrapping */
+ struct config_reg *cr;
+
+ lo = 0;
+ hi = (int)(sizeof(config_regs) / sizeof(*config_regs)) - 1;
+ while (hi >= lo) {
+ mid = lo + ((hi - lo) >> 1); /* lo <= mid <= hi, always in bounds */
+ cr = &config_regs[mid];
+ if (cr->cr_offset == offset)
+ return (cr);
+ if (cr->cr_offset < offset)
+ lo = mid + 1;
+ else
+ hi = mid - 1; /* -1 when mid == 0: ends the loop */
+ }
+ return (NULL);
+}
+
+/*
+ * Handle guest reads from the device's BARs.
+ * If it's to the MSI-X table/PBA BAR, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ */
+uint64_t
+vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct virtio_softc *vs = pi->pi_arg;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ uint64_t virtio_config_size, max;
+ const char *name;
+ uint32_t newoff;
+ uint32_t value;
+ int error;
+
+ if (vs->vs_flags & VIRTIO_USE_MSIX) {
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ return (pci_emul_msix_tread(pi, offset, size));
+ }
+ }
+
+ /* XXX probably should do something better than just assert() */
+ assert(baridx == 0);
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+ value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; /* all-ones unless a register is read */
+
+ if (size != 1 && size != 2 && size != 4)
+ goto bad;
+
+ if (pci_msix_enabled(pi)) /* header length depends on MSI-X being enabled */
+ virtio_config_size = VTCFG_R_CFG1;
+ else
+ virtio_config_size = VTCFG_R_CFG0;
+
+ if (offset >= virtio_config_size) {
+ /*
+ * Subtract off the standard size (including MSI-X
+ * registers if enabled) and dispatch to underlying driver.
+ * If that fails, fall into general code.
+ */
+ newoff = offset - virtio_config_size;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; /* cfgsize 0 => effectively unbounded */
+ if (newoff + size > max)
+ goto bad;
+ error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset); /* also reached for in-range header offsets */
+ if (cr == NULL || cr->cr_size != size) {
+ if (cr != NULL) {
+ /* offset must be OK, so size must be bad */
+ fprintf(stderr,
+ "%s: read from %s: bad size %d\r\n",
+ name, cr->cr_name, size);
+ } else {
+ fprintf(stderr,
+ "%s: read from bad offset/size %jd/%d\r\n",
+ name, (uintmax_t)offset, size);
+ }
+ goto done;
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ value = vc->vc_hv_caps;
+ break;
+ case VTCFG_R_GUESTCAP:
+ value = vs->vs_negotiated_caps;
+ break;
+ case VTCFG_R_PFN:
+ if (vs->vs_curq < vc->vc_nvq) /* else value stays all-ones */
+ value = vs->vs_queues[vs->vs_curq].vq_pfn;
+ break;
+ case VTCFG_R_QNUM:
+ value = vs->vs_curq < vc->vc_nvq ?
+ vs->vs_queues[vs->vs_curq].vq_qsize : 0;
+ break;
+ case VTCFG_R_QSEL:
+ value = vs->vs_curq;
+ break;
+ case VTCFG_R_QNOTIFY:
+ value = 0; /* XXX */
+ break;
+ case VTCFG_R_STATUS:
+ value = vs->vs_status;
+ break;
+ case VTCFG_R_ISR:
+ value = vs->vs_isr;
+ vs->vs_isr = 0; /* a read clears this flag */
+ if (value)
+ pci_lintr_deassert(pi);
+ break;
+ case VTCFG_R_CFGVEC:
+ value = vs->vs_msix_cfg_idx;
+ break;
+ case VTCFG_R_QVEC:
+ value = vs->vs_curq < vc->vc_nvq ?
+ vs->vs_queues[vs->vs_curq].vq_msix_idx :
+ VIRTIO_MSI_NO_VECTOR;
+ break;
+ }
+done:
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+ return (value);
+}
+
+/*
+ * Handle guest writes to the device's BARs.
+ * If it's to the MSI-X table/PBA BAR, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ */
+void
+vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct virtio_softc *vs = pi->pi_arg;
+ struct vqueue_info *vq;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ uint64_t virtio_config_size, max;
+ const char *name;
+ uint32_t newoff;
+ int error;
+
+ if (vs->vs_flags & VIRTIO_USE_MSIX) {
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ pci_emul_msix_twrite(pi, offset, size, value);
+ return;
+ }
+ }
+
+ /* XXX probably should do something better than just assert() */
+ assert(baridx == 0);
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+
+ if (size != 1 && size != 2 && size != 4)
+ goto bad;
+
+ if (pci_msix_enabled(pi)) /* header length depends on MSI-X being enabled */
+ virtio_config_size = VTCFG_R_CFG1;
+ else
+ virtio_config_size = VTCFG_R_CFG0;
+
+ if (offset >= virtio_config_size) {
+ /*
+ * Subtract off the standard size (including MSI-X
+ * registers if enabled) and dispatch to underlying driver.
+ */
+ newoff = offset - virtio_config_size;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; /* cfgsize 0 => effectively unbounded */
+ if (newoff + size > max)
+ goto bad;
+ error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset); /* also reached for in-range header offsets */
+ if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
+ if (cr != NULL) {
+ /* offset must be OK, wrong size and/or reg is R/O */
+ if (cr->cr_size != size)
+ fprintf(stderr,
+ "%s: write to %s: bad size %d\r\n",
+ name, cr->cr_name, size);
+ if (cr->cr_ro)
+ fprintf(stderr,
+ "%s: write to read-only reg %s\r\n",
+ name, cr->cr_name);
+ } else {
+ fprintf(stderr,
+ "%s: write to bad offset/size %jd/%d\r\n",
+ name, (uintmax_t)offset, size);
+ }
+ goto done;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ vs->vs_negotiated_caps = value & vc->vc_hv_caps; /* guest can't enable unoffered bits */
+ if (vc->vc_apply_features)
+ (*vc->vc_apply_features)(DEV_SOFTC(vs),
+ vs->vs_negotiated_caps);
+ break;
+ case VTCFG_R_PFN:
+ if (vs->vs_curq >= vc->vc_nvq)
+ goto bad_qindex;
+ vi_vq_init(vs, value);
+ break;
+ case VTCFG_R_QSEL:
+ /*
+ * Note that the guest is allowed to select an
+ * invalid queue; we just need to return a QNUM
+ * of 0 while the bad queue is selected.
+ */
+ vs->vs_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ if (value >= vc->vc_nvq) {
+ fprintf(stderr, "%s: queue %d notify out of range\r\n",
+ name, (int)value);
+ goto done;
+ }
+ vq = &vs->vs_queues[value];
+ if (vq->vq_notify) /* per-queue handler takes precedence */
+ (*vq->vq_notify)(DEV_SOFTC(vs), vq);
+ else if (vc->vc_qnotify)
+ (*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+ else
+ fprintf(stderr,
+ "%s: qnotify queue %d: missing vq/vc notify\r\n",
+ name, (int)value);
+ break;
+ case VTCFG_R_STATUS:
+ vs->vs_status = value;
+ if (value == 0) /* writing 0 requests a device reset */
+ (*vc->vc_reset)(DEV_SOFTC(vs));
+ break;
+ case VTCFG_R_CFGVEC:
+ vs->vs_msix_cfg_idx = value;
+ break;
+ case VTCFG_R_QVEC:
+ if (vs->vs_curq >= vc->vc_nvq)
+ goto bad_qindex;
+ vq = &vs->vs_queues[vs->vs_curq];
+ vq->vq_msix_idx = value;
+ break;
+ }
+ goto done;
+
+bad_qindex:
+ fprintf(stderr,
+ "%s: write config reg %s: curq %d >= max %d\r\n",
+ name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
+done:
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h
new file mode 100644
index 0000000000..a2c3362ec2
--- /dev/null
+++ b/usr/src/cmd/bhyve/virtio.h
@@ -0,0 +1,484 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#include <pthread_np.h>
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ * https://github.com/rustyrussell/virtio-spec
+ * http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ * +-----------------------------------------------+
+ * | "desc": <N> descriptors, 16 bytes each |
+ * | ----------------------------------------- |
+ * | "avail": 2 uint16; <N> uint16; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ * | "used": 2 x uint16; <N> elems; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set:
+ * NEXT descriptor is chained, so use its "next" field
+ * WRITE descriptor is for host to write into guest RAM
+ * (else host is to read from guest RAM)
+ * INDIRECT descriptor address field is (guest physical)
+ * address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <N> <ring> entries are simply
+ * indices into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+/* Alignment that vring_size() rounds each half of a ring up to. */
+#define VRING_ALIGN 4096
+
+/* Descriptor flag bits (struct virtio_desc, vd_flags). */
+#define VRING_DESC_F_NEXT (1 << 0) /* chained: use the "next" field */
+#define VRING_DESC_F_WRITE (1 << 1) /* host writes into guest RAM */
+#define VRING_DESC_F_INDIRECT (1 << 2) /* addr is an array of descriptors */
+
+/*
+ * One 16-byte entry of the descriptor table: a 64-bit address, a 32-bit
+ * length, 16 bits of flags and a 16-bit link to the next descriptor.
+ */
+struct virtio_desc { /* AKA vring_desc */
+ uint64_t vd_addr; /* guest physical address */
+ uint32_t vd_len; /* length of scatter/gather seg */
+ uint16_t vd_flags; /* VRING_DESC_F_* */
+ uint16_t vd_next; /* next desc if F_NEXT */
+} __packed;
+
+/*
+ * One element of the "used" ring: identifies a descriptor chain by its
+ * head index and records how many bytes were written into it.
+ */
+struct virtio_used { /* AKA vring_used_elem */
+ uint32_t vu_idx; /* head of used descriptor chain */
+ uint32_t vu_tlen; /* length written-to */
+} __packed;
+
+/* Flag bit for vring_avail.va_flags. */
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+/*
+ * Header of the "avail" ring.  va_ring is a flexible array member, so
+ * sizeof (struct vring_avail) covers only the two leading fields; the
+ * ring entries and the trailing used_event word follow in memory.
+ */
+struct vring_avail {
+ uint16_t va_flags; /* VRING_AVAIL_F_* */
+ uint16_t va_idx; /* counts to 65535, then cycles */
+ uint16_t va_ring[]; /* size N, reported in QNUM value */
+/* uint16_t va_used_event; -- after N ring entries */
+} __packed;
+
+/* Flag bit for vring_used.vu_flags. */
+#define VRING_USED_F_NO_NOTIFY 1
+/*
+ * Header of the "used" ring.  vu_ring is a flexible array member of
+ * struct virtio_used elements; the avail_event word follows the last one.
+ */
+struct vring_used {
+ uint16_t vu_flags; /* VRING_USED_F_* */
+ uint16_t vu_idx; /* counts to 65535, then cycles */
+ struct virtio_used vu_ring[]; /* size N */
+/* uint16_t vu_avail_event; -- after N ring entries */
+} __packed;
+
+/*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register. The guest writes the PFN into the
+ * PCI config space. However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue#1, etc. Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size. Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue. (The number of possible queues depends on the virtual
+ * device. The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
+ */
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ *
+ * XXX Should really be merged with <dev/virtio/virtio.h> defines
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+#define VIRTIO_TYPE_CONSOLE 3
+#define VIRTIO_TYPE_ENTROPY 4
+#define VIRTIO_TYPE_BALLOON 5
+#define VIRTIO_TYPE_IOMEMORY 6
+#define VIRTIO_TYPE_RPMSG 7
+#define VIRTIO_TYPE_SCSI 8
+#define VIRTIO_TYPE_9P 9
+
+/* experimental IDs start at 65535 and work down */
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_CONSOLE 0x1003
+#define VIRTIO_DEV_RANDOM 0x1005
+#define VIRTIO_DEV_SCSI 0x1008
+
+/*
+ * PCI config space constants.
+ *
+ * If MSI-X is enabled, the ISR register is generally not used,
+ * and the configuration vector and queue vector appear at offsets
+ * 20 and 22 with the remaining configuration registers at 24.
+ * If MSI-X is not enabled, those two registers disappear and
+ * the remaining configuration registers start at offset 20.
+ */
+/* Register offsets within the virtio I/O BAR. */
+#define VTCFG_R_HOSTCAP 0 /* presumably host feature bits -- confirm */
+#define VTCFG_R_GUESTCAP 4 /* guest features; masked by vc_hv_caps on write */
+#define VTCFG_R_PFN 8 /* page frame number of the selected queue */
+#define VTCFG_R_QNUM 12 /* read-only size of the selected queue */
+#define VTCFG_R_QSEL 14 /* selects the queue PFN/QNUM/QVEC refer to */
+#define VTCFG_R_QNOTIFY 16 /* guest writes a queue number to notify it */
+#define VTCFG_R_STATUS 18 /* VTCFG_STATUS_*; writing 0 requests a reset */
+#define VTCFG_R_ISR 19 /* VTCFG_ISR_*; only when not using MSI-X */
+#define VTCFG_R_CFGVEC 20 /* MSI-X vector for config events */
+#define VTCFG_R_QVEC 22 /* MSI-X vector for the selected queue */
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20 /* offset at which the MSI-X registers begin */
+
+/*
+ * Bits in VTCFG_R_STATUS. Guests need not actually set any of these,
+ * but a guest writing 0 to this register means "please reset".
+ */
+#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */
+#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */
+#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */
+#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */
+
+/*
+ * Bits in VTCFG_R_ISR. These apply only if not using MSI-X.
+ *
+ * (We don't [yet?] ever use CONF_CHANGED.)
+ */
+#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */
+#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */
+
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * Feature flags.
+ * Note: bits 0 through 23 are reserved to each device type.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/*
+ * Number of bytes a virtqueue of `qsz' entries occupies, per section 2.3,
+ * "Virtqueue Configuration", of the virtio specification.
+ *
+ * The descriptor table and the avail ring share the first region; the
+ * used ring starts on the next VRING_ALIGN boundary.  Both regions are
+ * padded out to VRING_ALIGN.
+ */
+static inline size_t
+vring_size(u_int qsz)
+{
+	/* descriptor table: one struct virtio_desc per entry */
+	const size_t desc_bytes = sizeof(struct virtio_desc) * qsz;
+	/* avail ring: va_flags, va_idx, va_used_event plus qsz entries */
+	const size_t avail_bytes = sizeof(uint16_t) * (3 + qsz);
+	/* used ring: vu_flags, vu_idx, vu_avail_event plus qsz elements */
+	const size_t used_bytes =
+	    sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
+	size_t total;
+
+	total = desc_bytes + avail_bytes;
+	total = roundup2(total, VRING_ALIGN);
+	total += used_bytes;
+	total = roundup2(total, VRING_ALIGN);
+
+	return (total);
+}
+
+struct vmctx;
+struct pci_devinst;
+struct vqueue_info;
+
+/*
+ * A virtual device, with some number (possibly 0) of virtual
+ * queues and some size (possibly 0) of configuration-space
+ * registers private to the device. The virtio_softc should come
+ * at the front of each "derived class", so that a pointer to the
+ * virtio_softc is also a pointer to the more specific, derived-
+ * from-virtio driver's softc.
+ *
+ * Note: inside each hypervisor virtio driver, changes to these
+ * data structures must be locked against other threads, if any.
+ * Except for PCI config space register read/write, we assume each
+ * driver does the required locking, but we need a pointer to the
+ * lock (if there is one) for PCI config space read/write ops.
+ *
+ * When the guest reads or writes the device's config space, the
+ * generic layer checks for operations on the special registers
+ * described above. If the offset of the register(s) being read
+ * or written is past the CFG area (CFG0 or CFG1), the request is
+ * passed on to the virtual device, after subtracting off the
+ * generic-layer size. (So, drivers can just use the offset as
+ * an offset into "struct config", for instance.)
+ *
+ * (The virtio layer also makes sure that the read or write is to/
+ * from a "good" config offset, hence vc_cfgsize, and on BAR #0.
+ * However, the driver must verify the read or write size and offset
+ * and that no one is writing a readonly register.)
+ *
+ * The BROKED flag ("this thing done gone and broked") is for future
+ * use.
+ */
+/* Bits for virtio_softc.vs_flags. */
+#define VIRTIO_USE_MSIX 0x01
+#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */
+#define VIRTIO_BROKED 0x08 /* ??? */
+
+/* Generic per-device state shared by every virtio device model. */
+struct virtio_softc {
+ struct virtio_consts *vs_vc; /* constants (see below) */
+ int vs_flags; /* VIRTIO_* flags from above */
+ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
+ struct pci_devinst *vs_pi; /* PCI device instance */
+ /*
+ * NOTE(review): 32 bits wide, whereas vc_hv_caps and the argument of
+ * vc_apply_features are 64 bits wide.
+ */
+ uint32_t vs_negotiated_caps; /* negotiated capabilities */
+ struct vqueue_info *vs_queues; /* one per vc_nvq */
+ /* Set from QSEL writes without validation; may be >= vc_nvq. */
+ int vs_curq; /* current queue */
+ uint8_t vs_status; /* value from last status write */
+ uint8_t vs_isr; /* ISR flags, if not MSI-X */
+ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */
+};
+
+/*
+ * Acquire / release the device mutex, if the device has one.
+ *
+ * `vs' is a struct virtio_softc pointer expression.  It is parenthesised
+ * in the expansion so that arguments such as `&sc->vsc_vs' or
+ * `cond ? a : b' bind correctly.  The argument is evaluated twice, so it
+ * must not have side effects.
+ */
+#define	VS_LOCK(vs)							\
+do {									\
+	if ((vs)->vs_mtx)						\
+		pthread_mutex_lock((vs)->vs_mtx);			\
+} while (0)
+
+#define	VS_UNLOCK(vs)							\
+do {									\
+	if ((vs)->vs_mtx)						\
+		pthread_mutex_unlock((vs)->vs_mtx);			\
+} while (0)
+
+/*
+ * Per-device-type constants and callbacks.  The first argument of every
+ * callback is the device softc (DEV_SOFTC(vs) in the generic code).
+ */
+struct virtio_consts {
+ const char *vc_name; /* name of driver (for diagnostics) */
+ int vc_nvq; /* number of virtual queues */
+ size_t vc_cfgsize; /* size of dev-specific config regs */
+ /* Invoked without a NULL check when the guest writes 0 to STATUS. */
+ void (*vc_reset)(void *); /* called on virtual device reset */
+ void (*vc_qnotify)(void *, struct vqueue_info *);
+ /* called on QNOTIFY if no VQ notify */
+ int (*vc_cfgread)(void *, int, int, uint32_t *);
+ /* called to read config regs */
+ int (*vc_cfgwrite)(void *, int, int, uint32_t);
+ /* called to write config regs */
+ /* Optional (may be NULL); invoked on a GUESTCAP write. */
+ void (*vc_apply_features)(void *, uint64_t);
+ /* called to apply negotiated features */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset. When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one. The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+/* Bits for vqueue_info.vq_flags. */
+#define VQ_ALLOC 0x01 /* set once we have a pfn */
+#define VQ_BROKED 0x02 /* ??? */
+/* State of one virtqueue; the softc holds an array of these (vs_queues). */
+struct vqueue_info {
+ uint16_t vq_qsize; /* size of this queue (a power of 2) */
+ void (*vq_notify)(void *, struct vqueue_info *);
+ /* called instead of vc_qnotify, if not NULL */
+
+ struct virtio_softc *vq_vs; /* backpointer to softc */
+ uint16_t vq_num; /* we're the num'th queue in the softc */
+
+ uint16_t vq_flags; /* flags (see above) */
+ uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */
+ uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */
+ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
+
+ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */
+
+ volatile struct virtio_desc *vq_desc; /* descriptor array */
+ volatile struct vring_avail *vq_avail; /* the "avail" ring */
+ volatile struct vring_used *vq_used; /* the "used" ring */
+
+};
+/*
+ * As noted above, these are sort of backwards, name-wise: the avail_event
+ * word sits just past the last element of the *used* ring, and the
+ * used_event word is entry [vq_qsize] of the *avail* ring.  Both expand
+ * to lvalues of 16-bit type.
+ */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+ (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+ ((vq)->vq_avail->va_ring[(vq)->vq_qsize])
+
+/*
+ * Report whether the ring can be used for I/O, i.e. whether VQ_ALLOC is
+ * set in vq_flags.  Returns the masked flag value (nonzero when ready).
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+	const int allocated = vq->vq_flags & VQ_ALLOC;
+
+	return (allocated);
+}
+
+/*
+ * Report whether at least one "available" descriptor is pending: the
+ * ring must be ready and the guest's va_idx must differ from the value
+ * we last consumed.  Yields 1 or 0; it does not count the descriptors.
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+	if (!vq_ring_ready(vq))
+		return (0);
+
+	return (vq->vq_last_avail != vq->vq_avail->va_idx);
+}
+
+/*
+ * Raise an interrupt in the guest on behalf of virtqueue `vq'.
+ *
+ * When MSI-X is enabled on the device, the queue's own vector
+ * (vq_msix_idx) is signalled and nothing else happens.  Otherwise
+ * VTCFG_ISR_QUEUES is latched into vs_isr and pci_generate_msi(.., 0)
+ * and pci_lintr_assert() are both called, under vs_mtx if the device has
+ * one.  On non-FreeBSD builds the mutex is only taken (and later dropped)
+ * when the calling thread does not already own it.
+ */
+static inline void
+vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
+{
+#ifndef __FreeBSD__
+	boolean_t took_lock = B_FALSE;
+#endif
+
+	if (pci_msix_enabled(vs->vs_pi)) {
+		pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
+		return;
+	}
+
+#ifndef __FreeBSD__
+	if (vs->vs_mtx && !pthread_mutex_isowned_np(vs->vs_mtx)) {
+		took_lock = B_TRUE;
+		pthread_mutex_lock(vs->vs_mtx);
+	}
+#else
+	VS_LOCK(vs);
+#endif
+
+	vs->vs_isr |= VTCFG_ISR_QUEUES;
+	pci_generate_msi(vs->vs_pi, 0);
+	pci_lintr_assert(vs->vs_pi);
+
+#ifndef __FreeBSD__
+	if (took_lock)
+		pthread_mutex_unlock(vs->vs_mtx);
+#else
+	VS_UNLOCK(vs);
+#endif
+}
+
+/*
+ * Generic virtio entry points.  Their definitions live outside this
+ * header; the grouping below is by name only.
+ */
+struct iovec;
+/* Device setup and reset. */
+void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct pci_devinst *pi,
+ struct vqueue_info *queues);
+int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void vi_reset_dev(struct virtio_softc *);
+void vi_set_io_bar(struct virtio_softc *, int);
+
+/* Descriptor-chain handling on a single virtqueue. */
+int vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
+ struct iovec *iov, int n_iov, uint16_t *flags);
+void vq_retchain(struct vqueue_info *vq);
+void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
+void vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+/* Guest accesses to the device BAR (see the VTCFG_R_* offsets). */
+uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size);
+void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value);
+#endif /* _VIRTIO_H_ */
diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c
new file mode 100644
index 0000000000..3278ea591c
--- /dev/null
+++ b/usr/src/cmd/bhyve/xmsr.c
@@ -0,0 +1,239 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/cpufunc.h>
+#include <machine/vmm.h>
+#include <machine/specialreg.h>
+
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xmsr.h"
+
+/*
+ * Host CPU vendor, set to 1 by init_msr() from the CPUID vendor string.
+ * At most one of the two is ever set; they select the MSR table used by
+ * emulate_rdmsr() and emulate_wrmsr().
+ */
+static int cpu_vendor_intel, cpu_vendor_amd;
+
+/*
+ * Handle a guest WRMSR of `num'.
+ *
+ * Every MSR listed for the detected host vendor is accepted and the
+ * written value silently discarded (return 0).  Any other MSR, or an
+ * undetected vendor, yields -1.  `ctx', `vcpu' and `val' are unused.
+ */
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val)
+{
+	int result = -1;
+
+	if (cpu_vendor_intel) {
+		switch (num) {
+#ifndef __FreeBSD__
+		case MSR_PERFCTR0:
+		case MSR_PERFCTR1:
+		case MSR_EVNTSEL0:
+		case MSR_EVNTSEL1:
+#endif
+		case 0xd04:			/* Sandy Bridge uncore PMCs */
+		case 0xc24:
+		case MSR_BIOS_UPDT_TRIG:
+		case MSR_BIOS_SIGN:
+			result = 0;
+			break;
+		default:
+			break;
+		}
+	} else if (cpu_vendor_amd) {
+		switch (num) {
+		case MSR_HWCR:			/* hardware configuration */
+		case MSR_NB_CFG1:
+		case MSR_IC_CFG:
+		case MSR_PERFEVSEL0:		/* PerfEvtSel MSRs */
+		case MSR_PERFEVSEL1:
+		case MSR_PERFEVSEL2:
+		case MSR_PERFEVSEL3:
+		case MSR_K7_PERFCTR0:		/* PerfCtr MSRs */
+		case MSR_K7_PERFCTR1:
+		case MSR_K7_PERFCTR2:
+		case MSR_K7_PERFCTR3:
+		case MSR_P_STATE_CONTROL:	/* P-state change request */
+			/* Ignore the write. */
+			result = 0;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (result);
+}
+
+/*
+ * Handle a guest RDMSR of `num'.
+ *
+ * For an MSR listed for the detected host vendor, a fixed value is stored
+ * in *val and 0 is returned.  Otherwise -1 is returned and *val is left
+ * untouched.  `ctx' and `vcpu' are unused.
+ */
+int
+emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
+{
+
+	if (cpu_vendor_intel) {
+		switch (num) {
+		case MSR_RAPL_POWER_UNIT:
+			/*
+			 * Use the default value documented in section
+			 * "RAPL Interfaces" in Intel SDM vol3.
+			 */
+			*val = 0x000a1003;
+			return (0);
+		case MSR_BIOS_SIGN:
+		case MSR_IA32_PLATFORM_ID:
+		case MSR_PKG_ENERGY_STATUS:
+		case MSR_PP0_ENERGY_STATUS:
+		case MSR_PP1_ENERGY_STATUS:
+		case MSR_DRAM_ENERGY_STATUS:
+			*val = 0;
+			return (0);
+		default:
+			return (-1);
+		}
+	}
+
+	if (cpu_vendor_amd) {
+		switch (num) {
+		case MSR_HWCR:
+			/*
+			 * Bios and Kernel Developer's Guides for AMD Families
+			 * 12H, 14H, 15H and 16H: the reset value, with
+			 * bit 9 (MONITOR/MWAIT disable) set.
+			 */
+			*val = 0x01000010 | (1 << 9);
+			return (0);
+
+		/*
+		 * OpenBSD guests test bit 0 of this MSR to detect if the
+		 * workaround for erratum 721 is already applied.
+		 * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf
+		 */
+		case 0xC0011029:
+			*val = 1;
+			return (0);
+
+		case MSR_BIOS_SIGN:
+		/* Reset value is processor family dependent; report 0. */
+		case MSR_NB_CFG1:
+		case MSR_IC_CFG:
+		/* PerfEvtSel MSRs are not properly virtualized. */
+		case MSR_PERFEVSEL0:
+		case MSR_PERFEVSEL1:
+		case MSR_PERFEVSEL2:
+		case MSR_PERFEVSEL3:
+		/* PerfCtr MSRs are not properly virtualized. */
+		case MSR_K7_PERFCTR0:
+		case MSR_K7_PERFCTR1:
+		case MSR_K7_PERFCTR2:
+		case MSR_K7_PERFCTR3:
+		/* Reset value from the AMD Bios and Kernel Developer's Guide. */
+		case MSR_SMM_ADDR:
+		case MSR_SMM_MASK:
+		case MSR_P_STATE_LIMIT:
+		case MSR_P_STATE_CONTROL:
+		case MSR_P_STATE_STATUS:
+		case MSR_P_STATE_CONFIG(0):	/* P0 configuration */
+			*val = 0;
+			return (0);
+
+		default:
+			return (-1);
+		}
+	}
+
+	return (-1);
+}
+
+/*
+ * Detect the host CPU vendor and record it in cpu_vendor_amd or
+ * cpu_vendor_intel.
+ *
+ * Returns 0 for "AuthenticAMD" or "GenuineIntel".  For any other vendor
+ * string a diagnostic is written to stderr and -1 is returned.
+ */
+int
+init_msr(void)
+{
+	u_int regs[4];
+	char cpu_vendor[13];
+
+	do_cpuid(0, regs);
+
+	/*
+	 * Assemble the 12-byte vendor string from regs[1], regs[3] and
+	 * regs[2], in that order (presumably EBX, EDX, ECX -- do_cpuid()'s
+	 * register layout is not visible here).  memcpy() is used rather
+	 * than storing through a (u_int *) cast of the char array, which
+	 * would violate the aliasing rules and assume alignment that a char
+	 * array does not guarantee.
+	 */
+	memcpy(&cpu_vendor[0], &regs[1], sizeof (regs[1]));
+	memcpy(&cpu_vendor[4], &regs[3], sizeof (regs[3]));
+	memcpy(&cpu_vendor[8], &regs[2], sizeof (regs[2]));
+	cpu_vendor[12] = '\0';
+
+	if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+		cpu_vendor_amd = 1;
+		return (0);
+	}
+	if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+		cpu_vendor_intel = 1;
+		return (0);
+	}
+
+	fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
+	return (-1);
+}
diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h
new file mode 100644
index 0000000000..1fb47c3ae2
--- /dev/null
+++ b/usr/src/cmd/bhyve/xmsr.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+/* Detect the host CPU vendor; 0 for AMD or Intel, -1 otherwise. */
+int init_msr(void);
+/*
+ * Emulate a guest write to / read of MSR `code'.  Both return 0 when the
+ * MSR is handled for the detected vendor and -1 when it is not.
+ */
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
+
+#endif
diff --git a/usr/src/cmd/bhyve/zhyve.c b/usr/src/cmd/bhyve/zhyve.c
new file mode 100644
index 0000000000..d3e764b14d
--- /dev/null
+++ b/usr/src/cmd/bhyve/zhyve.c
@@ -0,0 +1,167 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+/*
+ * This small 'zhyve' stub is init for the zone: we therefore need to pick up
+ * our command-line arguments placed in ZHYVE_CMD_FILE by the boot stub, do a
+ * little administration, and exec the real bhyve binary.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/corectl.h>
+
+#define ZHYVE_CMD_FILE "/var/run/bhyve/zhyve.cmd"
+
+/*
+ * Read exactly `len' bytes from `fd' into `buf'.
+ *
+ * Returns 0 once the buffer is full.  Returns -1 on failure: errno is
+ * EINVAL if end-of-file arrives first, otherwise whatever read(2) set.
+ * Reads that fail with EINTR or EAGAIN are retried.
+ */
+static int
+full_read(int fd, char *buf, size_t len)
+{
+	char *cursor = buf;
+	size_t remaining = len;
+
+	while (remaining != 0) {
+		ssize_t got = read(fd, cursor, remaining);
+
+		if (got > 0) {
+			cursor += got;
+			remaining -= (size_t)got;
+			continue;
+		}
+		if (got == 0) {
+			/* Premature end-of-file. */
+			errno = EINVAL;
+			return (-1);
+		}
+		if (errno != EINTR && errno != EAGAIN) {
+			return (-1);
+		}
+	}
+	assert(remaining == 0);
+
+	return (0);
+}
+
+/*
+ * Reads the command line options from the packed nvlist in the file
+ * referenced by path.
+ *
+ * On success, 0 is returned, *argvp and *argcp describe the "bhyve_args"
+ * string array, and the configuration is echoed to stdout.  The members
+ * of *argvp reference memory owned by the unpacked nvlist, which is
+ * therefore intentionally not freed.
+ *
+ * On failure, -1 is returned with errno describing the cause (callers
+ * report it with strerror()), nothing is printed, and any nvlist that was
+ * unpacked is freed.
+ */
+
+static int
+parse_options_file(const char *path, uint_t *argcp, char ***argvp)
+{
+	struct stat stbuf;
+	nvlist_t *nvl = NULL;
+	char *buf = NULL;
+	int fd;
+	int err = 0;
+
+	if ((fd = open(path, O_RDONLY)) < 0) {
+		err = errno;
+	} else if (fstat(fd, &stbuf) != 0) {
+		err = errno;
+	} else if ((buf = malloc(stbuf.st_size)) == NULL) {
+		/* malloc(0) may legitimately return NULL for an empty file. */
+		err = (stbuf.st_size == 0) ? EINVAL : ENOMEM;
+	} else if (full_read(fd, buf, stbuf.st_size) != 0) {
+		err = errno;
+	} else if ((err = nvlist_unpack(buf, stbuf.st_size, &nvl, 0)) == 0) {
+		/* libnvpair returns an errno value instead of setting errno. */
+		err = nvlist_lookup_string_array(nvl, "bhyve_args", argvp,
+		    argcp);
+	}
+
+	/* The packed image is not needed once unpacking was attempted. */
+	free(buf);
+	if (fd >= 0) {
+		(void) close(fd);
+	}
+
+	if (err != 0) {
+		/*
+		 * Do not print here: the nvlist is either absent or about to
+		 * be freed.  Set errno last so cleanup cannot clobber it.
+		 */
+		nvlist_free(nvl);
+		errno = err;
+		return (-1);
+	}
+
+	(void) printf("Configuration from %s:\n", path);
+	nvlist_print(stdout, nvl);
+
+	return (0);
+}
+
+/*
+ * Setup to suppress core dumps within the zone by clearing every core
+ * file option.  This is best effort: a failure of core_set_options() is
+ * deliberately ignored.
+ */
+static void
+config_core_dumps(void)
+{
+	(void) core_set_options(0x0);
+}
+
+/*
+ * Zone init stub: disable core dumps, point stdin at /dev/null and
+ * stdout/stderr at the zone's zfd devices (both unbuffered), load the
+ * bhyve argument vector from ZHYVE_CMD_FILE and exec /usr/sbin/bhyve
+ * with it.  Only returns (with EXIT_FAILURE) when something went wrong.
+ */
+int
+main(int argc, char **argv)
+{
+	char **exec_argv;
+	char **zargv;
+	uint_t zargc;
+	uint_t i;
+	int fd;
+
+	config_core_dumps();
+
+	/* stdin */
+	fd = open("/dev/null", O_WRONLY);
+	assert(fd >= 0);
+	if (fd != STDIN_FILENO) {
+		(void) dup2(fd, STDIN_FILENO);
+		(void) close(fd);
+	}
+
+	/* stdout */
+	fd = open("/dev/zfd/1", O_WRONLY);
+	assert(fd >= 0);
+	if (fd != STDOUT_FILENO) {
+		(void) dup2(fd, STDOUT_FILENO);
+		(void) close(fd);
+	}
+	setvbuf(stdout, NULL, _IONBF, 0);
+
+	/* stderr */
+	fd = open("/dev/zfd/2", O_WRONLY);
+	assert(fd >= 0);
+	if (fd != STDERR_FILENO) {
+		(void) dup2(fd, STDERR_FILENO);
+		(void) close(fd);
+	}
+	setvbuf(stderr, NULL, _IONBF, 0);
+
+	if (parse_options_file(ZHYVE_CMD_FILE, &zargc, &zargv) != 0) {
+		(void) fprintf(stderr, "%s: failed to parse %s: %s\n",
+		    argv[0], ZHYVE_CMD_FILE, strerror(errno));
+		return (EXIT_FAILURE);
+	}
+
+	/*
+	 * execv() wants a NULL-terminated vector, which the nvlist array
+	 * is not, so build a copy with one extra slot.
+	 */
+	exec_argv = malloc(sizeof (*zargv) * (zargc + 1));
+	if (exec_argv == NULL) {
+		perror("malloc failed");
+		return (EXIT_FAILURE);
+	}
+
+	for (i = 0; i < zargc; i++) {
+		exec_argv[i] = zargv[i];
+	}
+	exec_argv[zargc] = NULL;
+
+	(void) execv("/usr/sbin/bhyve", exec_argv);
+
+	/* Only reached if execv() failed. */
+	perror("execv failed");
+	return (EXIT_FAILURE);
+}