diff options
Diffstat (limited to 'usr/src/cmd/bhyve')
102 files changed, 40879 insertions, 0 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile new file mode 100644 index 0000000000..0ad066e6d4 --- /dev/null +++ b/usr/src/cmd/bhyve/Makefile @@ -0,0 +1,167 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Pluribus Networks Inc. +# Copyright (c) 2018, Joyent, Inc. +# + +PROG = bhyve + +include ../Makefile.cmd +include ../Makefile.cmd.64 +include ../Makefile.ctf + +SUBDIRS = test + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +SRCS = acpi.c \ + atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + console.c \ + consport.c \ + dbgport.c \ + fwctl.c \ + gdb.c \ + inout.c \ + ioapic.c \ + mem.c \ + mevent.c \ + mptbl.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_uart.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_virtio_viona.c \ + pci_xhci.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + iov.c \ + bhyve_sol_glue.c + +# The virtio-scsi driver appears to include a slew of materials from FreeBSD's +# native SCSI implementation. We will omit that complexity for now. + #ctl_util.c \ + #ctl_scsi_all.c \ + #pci_virtio_scsi.c \ + + +OBJS = $(SRCS:.c=.o) + +CLOBBERFILES = $(ROOTUSRSBINPROG) $(ZHYVE) + +ZHYVE_DIR = $(ROOT)/usr/lib/brand/bhyve +ZHYVE_PROG = zhyve +ZHYVE = $(ZHYVE_DIR)/$(ZHYVE_PROG) + +MEVENT_TEST_PROG = mevent_test +MEVENT_TEST_SRCS = mevent.c mevent_test.c +MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o) + +CLEANFILES = $(PROG) $(ZHYVE_PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS) + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + -I$(SRC)/uts/common/io/e1000api \ + $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common \ + -DWITHOUT_CAPSICUM + +# Disable the crypto code until it is wired up +CPPFLAGS += -DNO_OPENSSL + +pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign + +# Force c99 for everything +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz +$(ZHYVE_PROG) := LDLIBS += -lnvpair +$(MEVENT_TEST_PROG) := LDLIBS += -lsocket + +POST_PROCESS += ; $(GENSETDEFS) $@ + +.KEEP_STATE: + +all: $(PROG) $(MEVENT_TEST_PROG) $(ZHYVE_PROG) $(SUBDIRS) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +$(MEVENT_TEST_PROG): $(MEVENT_TEST_OBJS) + $(LINK.c) -o $@ $(MEVENT_TEST_OBJS) $(LDFLAGS) $(LDLIBS) + +install: all $(ZHYVE) $(ROOTUSRSBINPROG) $(SUBDIRS) + +clean: $(SUBDIRS) + $(RM) $(OBJS) $(CLEANFILES) + +clobber: clean $(SUBDIRS) + $(RM) $(CLOBBERFILES) + +lint: lint_SRCS $(SUBDIRS) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include ../Makefile.targ + +$(ZHYVE_DIR)/%: % + $(INS.file) + +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c new file mode 100644 index 0000000000..309ba98a11 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.c @@ -0,0 +1,983 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * bhyve ACPI table generator. + * + * Create the minimal set of ACPI tables required to boot FreeBSD (and + * hopefully other o/s's) by writing out ASL template files for each of + * the tables and the compiling them to AML with the Intel iasl compiler. + * The AML files are then read into guest memory. + * + * The tables are placed in the guest's ROM area just below 1MB physical, + * above the MPTable. + * + * Layout + * ------ + * RSDP -> 0xf2400 (36 bytes fixed) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) + * MADT -> 0xf2500 (depends on #CPUs) + * FADT -> 0xf2600 (268 bytes) + * HPET -> 0xf2740 (56 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) + * DSDT -> 0xf2800 (variable - can go up to 0x100000) + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/stat.h> + +#include <paths.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "acpi.h" +#include "pci_emul.h" + +/* + * Define the base address of the ACPI tables, and the offsets to + * the individual tables + */ +#define BHYVE_ACPI_BASE 0xf2400 +#define RSDT_OFFSET 0x040 +#define XSDT_OFFSET 0x080 +#define MADT_OFFSET 0x100 +#define FADT_OFFSET 0x200 +#define HPET_OFFSET 0x340 +#define MCFG_OFFSET 0x380 +#define FACS_OFFSET 0x3C0 +#define DSDT_OFFSET 0x400 + +#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" +#define BHYVE_ASL_SUFFIX ".aml" +#define BHYVE_ASL_COMPILER "/usr/sbin/iasl" + +static int basl_keep_temps; +static int basl_verbose_iasl; +static int basl_ncpu; +static uint32_t basl_acpi_base = BHYVE_ACPI_BASE; +static uint32_t hpet_capabilities; + +/* + * Contains the full pathname of the template to be passed + * to mkstemp/mktemps(3) + */ +static char basl_template[MAXPATHLEN]; +static char basl_stemplate[MAXPATHLEN]; + +/* + * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). + */ +static FILE *dsdt_fp; +static int dsdt_indent_level; +static int dsdt_error; + +struct basl_fio { + int fd; + FILE *fp; + char f_name[MAXPATHLEN]; +}; + +#define EFPRINTF(...) \ + if (fprintf(__VA_ARGS__) < 0) goto err_exit; + +#define EFFLUSH(x) \ + if (fflush(x) != 0) goto err_exit; + +static int +basl_fwrite_rsdp(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDP template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); + EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", + basl_acpi_base + RSDT_OFFSET); + EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); + EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", + basl_acpi_base + XSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_rsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_xsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve XSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_madt(FILE *fp) +{ + int i; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); + EFPRINTF(fp, "\n"); + + /* Add a Processor Local APIC entry for each CPU */ + for (i = 0; i < basl_ncpu; i++) { + EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); + EFPRINTF(fp, "[0001]\t\tLength : 08\n"); + /* iasl expects hex values for the proc and apic id's */ + EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); + EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); + EFPRINTF(fp, "\n"); + } + + /* Always a single IOAPIC entry, with ID 0 */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); + /* iasl expects a hex value for the i/o apic id */ + EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : 00\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); + EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tPolarity : 3\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); + EFPRINTF(fp, "\n"); + + /* Local APIC NMI is connected to LINT 1 on all CPUs */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); + EFPRINTF(fp, "[0001]\t\tLength : 06\n"); + EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_fadt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tModel : 01\n"); + EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); + EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", + SCI_INT); + EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", + SMI_CMD); + EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", + BHYVE_ACPI_ENABLE); + EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", + BHYVE_ACPI_DISABLE); + EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); + EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", + IO_PMTMR); + EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); + EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); + EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); + EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); + EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); + EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); + EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); + EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); + EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); + EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tReset Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); + EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); + EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); + EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); + EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); + EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, + "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + /* Valid for bhyve */ + EFPRINTF(fp, + "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + IO_PMTMR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_hpet(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve HPET template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); + EFPRINTF(fp, + "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); + EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); + EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_mcfg(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int +basl_fwrite_facs(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FACS template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); + EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); + EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); + EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); + EFPRINTF(fp, + "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); + EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Helper routines for writing to the DSDT from other modules. + */ +void +dsdt_line(const char *fmt, ...) +{ + va_list ap; + + if (dsdt_error != 0) + return; + + if (strcmp(fmt, "") != 0) { + if (dsdt_indent_level != 0) + EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); + va_start(ap, fmt); + if (vfprintf(dsdt_fp, fmt, ap) < 0) { + va_end(ap); + goto err_exit; + } + va_end(ap); + } + EFPRINTF(dsdt_fp, "\n"); + return; + +err_exit: + dsdt_error = errno; +} + +void +dsdt_indent(int levels) +{ + + dsdt_indent_level += levels; + assert(dsdt_indent_level >= 0); +} + +void +dsdt_unindent(int levels) +{ + + assert(dsdt_indent_level >= levels); + dsdt_indent_level -= levels; +} + +void +dsdt_fixed_ioport(uint16_t iobase, uint16_t length) +{ + + dsdt_line("IO (Decode16,"); + dsdt_line(" 0x%04X, // Range Minimum", iobase); + dsdt_line(" 0x%04X, // Range Maximum", iobase); + dsdt_line(" 0x01, // Alignment"); + dsdt_line(" 0x%02X, // Length", length); + dsdt_line(" )"); +} + +void +dsdt_fixed_irq(uint8_t irq) +{ + + dsdt_line("IRQNoFlags ()"); + dsdt_line(" {%d}", irq); +} + +void +dsdt_fixed_mem32(uint32_t base, uint32_t length) +{ + + dsdt_line("Memory32Fixed (ReadWrite,"); + dsdt_line(" 0x%08X, // Address Base", base); + dsdt_line(" 0x%08X, // Address Length", length); + dsdt_line(" )"); +} + +static int +basl_fwrite_dsdt(FILE *fp) +{ + dsdt_fp = fp; + dsdt_error = 0; + dsdt_indent_level = 0; + + dsdt_line("/*"); + dsdt_line(" * bhyve DSDT template"); + dsdt_line(" */"); + dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," + "\"BHYVE \", \"BVDSDT \", 0x00000001)"); + dsdt_line("{"); + dsdt_line(" Name (_S5, Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x05,"); + dsdt_line(" Zero,"); + dsdt_line(" })"); + + pci_write_dsdt(); + + dsdt_line(""); + dsdt_line(" Scope (_SB.PC00)"); + dsdt_line(" {"); + dsdt_line(" Device (HPET)"); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); + dsdt_line(" Name (_UID, 0)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + dsdt_fixed_mem32(0xFED00000, 0x400); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line("}"); + + if (dsdt_error != 0) + return (dsdt_error); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_open(struct basl_fio *bf, int suffix) +{ + int err; + + err = 0; + + if (suffix) { + strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN); + bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); + } else { + strlcpy(bf->f_name, basl_template, MAXPATHLEN); + bf->fd = mkstemp(bf->f_name); + } + + if (bf->fd > 0) { + bf->fp = fdopen(bf->fd, "w+"); + if (bf->fp == NULL) { + unlink(bf->f_name); + close(bf->fd); + } + } else { + err = 1; + } + + return (err); +} + +static void +basl_close(struct basl_fio *bf) +{ + + if (!basl_keep_temps) + unlink(bf->f_name); + fclose(bf->fp); +} + +static int +basl_start(struct basl_fio *in, struct basl_fio *out) +{ + int err; + + err = basl_open(in, 0); + if (!err) { + err = basl_open(out, 1); + if (err) { + basl_close(in); + } + } + + return (err); +} + +static void +basl_end(struct basl_fio *in, struct basl_fio *out) +{ + + basl_close(in); + basl_close(out); +} + +static int +basl_load(struct vmctx *ctx, int fd, uint64_t off) +{ + struct stat sb; + void *gaddr; + + if (fstat(fd, &sb) < 0) + return (errno); + + gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); + if (gaddr == NULL) + return (EFAULT); + + if (read(fd, gaddr, sb.st_size) < 0) + return (errno); + + return (0); +} + +static int +basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) +{ + struct basl_fio io[2]; + static char iaslbuf[3*MAXPATHLEN + 10]; + char *fmt; + int err; + + err = basl_start(&io[0], &io[1]); + if (!err) { + err = (*fwrite_section)(io[0].fp); + + if (!err) { + /* + * iasl sends the results of the compilation to + * stdout. Shut this down by using the shell to + * redirect stdout to /dev/null, unless the user + * has requested verbose output for debugging + * purposes + */ + fmt = basl_verbose_iasl ? + "%s -p %s %s" : + "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; + + snprintf(iaslbuf, sizeof(iaslbuf), + fmt, + BHYVE_ASL_COMPILER, + io[1].f_name, io[0].f_name); + err = system(iaslbuf); + + if (!err) { + /* + * Copy the aml output file into guest + * memory at the specified location + */ + err = basl_load(ctx, io[1].fd, offset); + } + } + basl_end(&io[0], &io[1]); + } + + return (err); +} + +static int +basl_make_templates(void) +{ + const char *tmpdir; + int err; + int len; + + err = 0; + + /* + * + */ + if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' || + (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') { + tmpdir = _PATH_TMP; + } + + len = strlen(tmpdir); + + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { + strcpy(basl_template, tmpdir); + while (len > 0 && basl_template[len - 1] == '/') + len--; + basl_template[len] = '/'; + strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); + } else + err = E2BIG; + + if (!err) { + /* + * len has been intialized (and maybe adjusted) above + */ + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { + strcpy(basl_stemplate, tmpdir); + basl_stemplate[len] = '/'; + strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); + len = strlen(basl_stemplate); + strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); + } else + err = E2BIG; + } + + return (err); +} + +static struct { + int (*wsect)(FILE *fp); + uint64_t offset; +} basl_ftables[] = +{ + { basl_fwrite_rsdp, 0}, + { basl_fwrite_rsdt, RSDT_OFFSET }, + { basl_fwrite_xsdt, XSDT_OFFSET }, + { basl_fwrite_madt, MADT_OFFSET }, + { basl_fwrite_fadt, FADT_OFFSET }, + { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, + { basl_fwrite_facs, FACS_OFFSET }, + { basl_fwrite_dsdt, DSDT_OFFSET }, + { NULL } +}; + +int +acpi_build(struct vmctx *ctx, int ncpu) +{ + int err; + int i; + + basl_ncpu = ncpu; + + err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); + if (err != 0) + return (err); + + /* + * For debug, allow the user to have iasl compiler output sent + * to stdout rather than /dev/null + */ + if (getenv("BHYVE_ACPI_VERBOSE_IASL")) + basl_verbose_iasl = 1; + + /* + * Allow the user to keep the generated ASL files for debugging + * instead of deleting them following use + */ + if (getenv("BHYVE_ACPI_KEEPTMPS")) + basl_keep_temps = 1; + + i = 0; + err = basl_make_templates(); + + /* + * Run through all the ASL files, compiling them and + * copying them into guest memory + */ + while (!err && basl_ftables[i].wsect != NULL) { + err = basl_compile(ctx, basl_ftables[i].wsect, + basl_ftables[i].offset); + i++; + } + + return (err); +} diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h new file mode 100644 index 0000000000..4c6d86d091 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ACPI_H_ +#define _ACPI_H_ + +#define SCI_INT 9 + +#define SMI_CMD 0xb2 +#define BHYVE_ACPI_ENABLE 0xa0 +#define BHYVE_ACPI_DISABLE 0xa1 + +#define PM1A_EVT_ADDR 0x400 +#define PM1A_CNT_ADDR 0x404 + +#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ + +struct vmctx; + +int acpi_build(struct vmctx *ctx, int ncpu); +void dsdt_line(const char *fmt, ...); +void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); +void dsdt_fixed_irq(uint8_t irq); +void dsdt_fixed_mem32(uint32_t base, uint32_t length); +void dsdt_indent(int levels); +void dsdt_unindent(int levels); +void sci_init(struct vmctx *ctx); + +#endif /* _ACPI_H_ */ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h new file mode 100644 index 0000000000..691d4bd438 --- /dev/null +++ b/usr/src/cmd/bhyve/ahci.h @@ -0,0 +1,324 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org> + * Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AHCI_H_ +#define _AHCI_H_ + +/* ATA register defines */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ + +/* SATA register defines */ +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 + +/* SATA AHCI v1.0 register defines */ +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 + +/* Just to be sure, if building as module. */ +#if MAXPHYS < 512 * 1024 +#undef MAXPHYS +#define MAXPHYS 512 * 1024 +#endif +/* Pessimistic prognosis on number of required S/G entries */ +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +/* Command list. 32 commands. First, 1Kbyte aligned. */ +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 +/* Command tables. Up to 32 commands, Each, 128byte aligned. */ +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +/* Total main work area. */ +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) + +#endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c new file mode 100644 index 0000000000..1c1838c2e8 --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -0,0 +1,586 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/vmm.h> + +#include <vmmapi.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "acpi.h" +#include "atkbdc.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "ps2kbd.h" +#include "ps2mouse.h" + +#define KBD_DATA_PORT 0x60 + +#define KBD_STS_CTL_PORT 0x64 + +#define KBDC_RESET 0xfe + +#define KBD_DEV_IRQ 1 +#define AUX_DEV_IRQ 12 + +/* controller commands */ +#define KBDC_SET_COMMAND_BYTE 0x60 +#define KBDC_GET_COMMAND_BYTE 0x20 +#define KBDC_DISABLE_AUX_PORT 0xa7 +#define KBDC_ENABLE_AUX_PORT 0xa8 +#define KBDC_TEST_AUX_PORT 0xa9 +#define KBDC_TEST_CTRL 0xaa +#define KBDC_TEST_KBD_PORT 0xab +#define KBDC_DISABLE_KBD_PORT 0xad +#define KBDC_ENABLE_KBD_PORT 0xae +#define KBDC_READ_INPORT 0xc0 +#define KBDC_READ_OUTPORT 0xd0 +#define KBDC_WRITE_OUTPORT 0xd1 +#define KBDC_WRITE_KBD_OUTBUF 0xd2 +#define KBDC_WRITE_AUX_OUTBUF 0xd3 +#define KBDC_WRITE_TO_AUX 0xd4 + +/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */ +#define KBD_TRANSLATION 0x40 +#define KBD_SYS_FLAG_BIT 0x04 +#define KBD_DISABLE_KBD_PORT 0x10 +#define KBD_DISABLE_AUX_PORT 0x20 +#define KBD_ENABLE_AUX_INT 0x02 +#define KBD_ENABLE_KBD_INT 0x01 +#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT) +#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT) + +/* controller status bits */ +#define KBDS_KBD_BUFFER_FULL 0x01 +#define KBDS_SYS_FLAG 0x04 +#define KBDS_CTRL_FLAG 0x08 +#define KBDS_AUX_BUFFER_FULL 0x20 + +/* controller output port */ +#define KBDO_KBD_OUTFULL 0x10 +#define KBDO_AUX_OUTFULL 0x20 + +#define RAMSZ 32 +#define FIFOSZ 15 +#define CTRL_CMD_FLAG 0x8000 + +struct kbd_dev { + bool irq_active; + int irq; + + uint8_t buffer[FIFOSZ]; + int brd, bwr; + int bcnt; +}; + +struct aux_dev { + bool irq_active; + int irq; +}; + +struct atkbdc_softc { + struct vmctx *ctx; + pthread_mutex_t mtx; + + struct ps2kbd_softc *ps2kbd_sc; + struct ps2mouse_softc *ps2mouse_sc; + + uint8_t status; /* status register */ + uint8_t outport; /* controller output port */ + uint8_t ram[RAMSZ]; /* byte0 = controller config */ + + uint32_t curcmd; /* current command for next byte */ + uint32_t ctrlbyte; + + struct kbd_dev kbd; + struct aux_dev aux; +}; + +static void +atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) +{ + if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + sc->kbd.irq_active = true; + vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); + } +} + +static void +atkbdc_assert_aux_intr(struct atkbdc_softc *sc) +{ + if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + sc->aux.irq_active = true; + vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + } +} + +static int +atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (sc->kbd.bcnt < FIFOSZ) { + sc->kbd.buffer[sc->kbd.bwr] = val; + sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; + sc->kbd.bcnt++; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + } else { + printf("atkbd data buffer full\n"); + } + + return (sc->kbd.bcnt < FIFOSZ); +} + +static void +atkbdc_kbd_read(struct atkbdc_softc *sc) +{ + const uint8_t translation[256] = { + 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58, + 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59, + 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a, + 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b, + 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c, + 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d, + 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e, + 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f, + 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60, + 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61, + 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e, + 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76, + 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b, + 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f, + 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45, + 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54, + 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff + }; + uint8_t val; + uint8_t release = 0; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (sc->ram[0] & KBD_TRANSLATION) { + while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) { + if (val == 0xf0) { + release = 0x80; + continue; + } else { + val = translation[val] | release; + } + atkbdc_kbd_queue_data(sc, val); + break; + } + } else { + while (sc->kbd.bcnt < FIFOSZ) { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + else + break; + } + } + + if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || + ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) + atkbdc_assert_kbd_intr(sc); +} + +static void +atkbdc_aux_poll(struct atkbdc_softc *sc) +{ + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); + } +} + +static void +atkbdc_kbd_poll(struct atkbdc_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + atkbdc_kbd_read(sc); +} + +static void +atkbdc_poll(struct atkbdc_softc *sc) +{ + atkbdc_aux_poll(sc); + atkbdc_kbd_poll(sc); +} + +static void +atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) { + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { + if (sc->kbd.bcnt == 0) + sc->status &= ~(KBDS_AUX_BUFFER_FULL | + KBDS_KBD_BUFFER_FULL); + else + sc->status &= ~(KBDS_AUX_BUFFER_FULL); + sc->outport &= ~KBDO_AUX_OUTFULL; + } + + atkbdc_poll(sc); + return; + } + + if (sc->kbd.bcnt > 0) { + *buf = sc->kbd.buffer[sc->kbd.brd]; + sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; + sc->kbd.bcnt--; + if (sc->kbd.bcnt == 0) { + sc->status &= ~KBDS_KBD_BUFFER_FULL; + sc->outport &= ~KBDO_KBD_OUTFULL; + } + + atkbdc_poll(sc); + } + + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + } +} + +static int +atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct atkbdc_softc *sc; + uint8_t buf; + int retval; + + if (bytes != 1) + return (-1); + sc = arg; + retval = 0; + + pthread_mutex_lock(&sc->mtx); + if (in) { + sc->curcmd = 0; + if (sc->ctrlbyte != 0) { + *eax = sc->ctrlbyte & 0xff; + sc->ctrlbyte = 0; + } else { + /* read device buffer; includes kbd cmd responses */ + atkbdc_dequeue_data(sc, &buf); + *eax = buf; + } + + sc->status &= ~KBDS_CTRL_FLAG; + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + if (sc->status & KBDS_CTRL_FLAG) { + /* + * Command byte for the controller. + */ + switch (sc->curcmd) { + case KBDC_SET_COMMAND_BYTE: + sc->ram[0] = *eax; + if (sc->ram[0] & KBD_SYS_FLAG_BIT) + sc->status |= KBDS_SYS_FLAG; + else + sc->status &= ~KBDS_SYS_FLAG; + break; + case KBDC_WRITE_OUTPORT: + sc->outport = *eax; + break; + case KBDC_WRITE_TO_AUX: + ps2mouse_write(sc->ps2mouse_sc, *eax, 0); + atkbdc_poll(sc); + break; + case KBDC_WRITE_KBD_OUTBUF: + atkbdc_kbd_queue_data(sc, *eax); + break; + case KBDC_WRITE_AUX_OUTBUF: + ps2mouse_write(sc->ps2mouse_sc, *eax, 1); + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + atkbdc_aux_poll(sc); + break; + default: + /* write to particular RAM byte */ + if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) { + int byten; + + byten = (sc->curcmd - 0x60) & 0x1f; + sc->ram[byten] = *eax & 0xff; + } + break; + } + + sc->curcmd = 0; + sc->status &= ~KBDS_CTRL_FLAG; + + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + /* + * Data byte for the device. + */ + ps2kbd_write(sc->ps2kbd_sc, *eax); + atkbdc_poll(sc); + + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +static int +atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + struct atkbdc_softc *sc; + int error, retval; + + if (bytes != 1) + return (-1); + + sc = arg; + retval = 0; + + pthread_mutex_lock(&sc->mtx); + + if (in) { + /* read status register */ + *eax = sc->status; + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + + sc->curcmd = 0; + sc->status |= KBDS_CTRL_FLAG; + sc->ctrlbyte = 0; + + switch (*eax) { + case KBDC_GET_COMMAND_BYTE: + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; + break; + case KBDC_TEST_CTRL: + sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; + break; + case KBDC_TEST_AUX_PORT: + case KBDC_TEST_KBD_PORT: + sc->ctrlbyte = CTRL_CMD_FLAG | 0; + break; + case KBDC_READ_INPORT: + sc->ctrlbyte = CTRL_CMD_FLAG | 0; + break; + case KBDC_READ_OUTPORT: + sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; + break; + case KBDC_SET_COMMAND_BYTE: + case KBDC_WRITE_OUTPORT: + case KBDC_WRITE_KBD_OUTBUF: + case KBDC_WRITE_AUX_OUTBUF: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_KBD_PORT: + sc->ram[0] |= KBD_DISABLE_KBD_PORT; + break; + case KBDC_ENABLE_KBD_PORT: + sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + if (sc->kbd.bcnt > 0) + sc->status |= KBDS_KBD_BUFFER_FULL; + atkbdc_poll(sc); + break; + case KBDC_WRITE_TO_AUX: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_AUX_PORT: + sc->ram[0] |= KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 0); + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport &= ~KBDS_AUX_BUFFER_FULL; + break; + case KBDC_ENABLE_AUX_PORT: + sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 1); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + break; + case KBDC_RESET: /* Pulse "reset" line */ + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + break; + default: + if (*eax >= 0x21 && *eax <= 0x3f) { + /* read "byte N" from RAM */ + int byten; + + byten = (*eax - 0x20) & 0x1f; + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten]; + } + break; + } + + pthread_mutex_unlock(&sc->mtx); + + if (sc->ctrlbyte != 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->status &= ~KBDS_AUX_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && + (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + atkbdc_assert_aux_intr(sc); + } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } + + return (retval); +} + +void +atkbdc_event(struct atkbdc_softc *sc, int iskbd) +{ + pthread_mutex_lock(&sc->mtx); + + if (iskbd) + atkbdc_kbd_poll(sc); + else + atkbdc_aux_poll(sc); + pthread_mutex_unlock(&sc->mtx); +} + +void +atkbdc_init(struct vmctx *ctx) +{ + struct inout_port iop; + struct atkbdc_softc *sc; + int error; + + sc = calloc(1, sizeof(struct atkbdc_softc)); + sc->ctx = ctx; + + pthread_mutex_init(&sc->mtx, NULL); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_STS_CTL_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_sts_ctl_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_DATA_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_data_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + pci_irq_reserve(KBD_DEV_IRQ); + sc->kbd.irq = KBD_DEV_IRQ; + + pci_irq_reserve(AUX_DEV_IRQ); + sc->aux.irq = AUX_DEV_IRQ; + + sc->ps2kbd_sc = ps2kbd_init(sc); + sc->ps2mouse_sc = ps2mouse_init(sc); +} + +static void +atkbdc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (KBD)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(1); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (MOU)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(12); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(atkbdc_dsdt); + diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h new file mode 100644 index 0000000000..85c8a7141e --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ATKBDC_H_ +#define _ATKBDC_H_ + +struct atkbdc_softc; +struct vmctx; + +void atkbdc_init(struct vmctx *ctx); +void atkbdc_event(struct atkbdc_softc *sc, int iskbd); + +#endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c new file mode 100644 index 0000000000..7b24ea7f5d --- /dev/null +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/uio.h> + +#include <termios.h> +#include <unistd.h> + +/* + * Make a pre-existing termios structure into "raw" mode: character-at-a-time + * mode with no characters interpreted, 8-bit data path. + */ +void +cfmakeraw(struct termios *t) +{ + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR| + ICRNL|IXON|IGNPAR); + t->c_iflag |= IGNBRK; + t->c_oflag &= ~OPOST; + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH| + TOSTOP|PENDIN); + t->c_cflag &= ~(CSIZE|PARENB); + t->c_cflag |= CS8|CREAD; + t->c_cc[VMIN] = 1; + t->c_cc[VTIME] = 0; +} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c new file mode 100644 index 0000000000..4bd49ded79 --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -0,0 +1,103 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "bhyvegc.h" + +struct bhyvegc { + struct bhyvegc_image *gc_image; + int raw; +}; + +struct bhyvegc * +bhyvegc_init(int width, int height, void *fbaddr) +{ + struct bhyvegc *gc; + struct bhyvegc_image *gc_image; + + gc = calloc(1, sizeof (struct bhyvegc)); + + gc_image = calloc(1, sizeof(struct bhyvegc_image)); + gc_image->width = width; + gc_image->height = height; + if (fbaddr) { + gc_image->data = fbaddr; + gc->raw = 1; + } else { + gc_image->data = calloc(width * height, sizeof (uint32_t)); + gc->raw = 0; + } + + gc->gc_image = gc_image; + + return (gc); +} + +void +bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr) +{ + gc->raw = 1; + if (gc->gc_image->data && gc->gc_image->data != fbaddr) + free(gc->gc_image->data); + gc->gc_image->data = fbaddr; +} + +void +bhyvegc_resize(struct bhyvegc *gc, int width, int height) +{ + struct bhyvegc_image *gc_image; + + gc_image = gc->gc_image; + + gc_image->width = width; + gc_image->height = height; + if (!gc->raw) { + gc_image->data = reallocarray(gc_image->data, width * height, + sizeof (uint32_t)); + if (gc_image->data != NULL) + memset(gc_image->data, 0, width * height * + sizeof (uint32_t)); + } +} + +struct bhyvegc_image * +bhyvegc_get_image(struct bhyvegc *gc) +{ + if (gc == NULL) + return (NULL); + + return (gc->gc_image); +} diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h new file mode 100644 index 0000000000..11323586df --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -0,0 +1,48 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVEGC_H_ +#define _BHYVEGC_H_ + +struct bhyvegc; + +struct bhyvegc_image { + int vgamode; + int width; + int height; + uint32_t *data; +}; + +struct bhyvegc *bhyvegc_init(int width, int height, void *fbaddr); +void bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr); +void bhyvegc_resize(struct bhyvegc *gc, int width, int height); +struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); + +#endif /* _BHYVEGC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c new file mode 100644 index 0000000000..ccf89b4613 --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -0,0 +1,1395 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/cpuset.h> + +#ifdef __FreeBSD__ +#include <amd64/vmm/intel/vmcs.h> +#else +#include <intel/vmcs.h> +#endif + +#include <machine/atomic.h> +#include <machine/segments.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <errno.h> +#include <libgen.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> +#include <sysexits.h> +#include <stdbool.h> +#include <stdint.h> + +#include <machine/vmm.h> +#ifndef WITHOUT_CAPSICUM +#include <machine/vmm_dev.h> +#endif +#include <vmmapi.h> + +#ifndef __FreeBSD__ +#include <sys/stat.h> +#endif + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "console.h" +#include "inout.h" +#include "dbgport.h" +#include "fwctl.h" +#include "gdb.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rfb.h" +#include "rtc.h" +#include "vga.h" + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +static const char * const vmx_exit_reason_desc[] = { + [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", + [EXIT_REASON_EXT_INTR] = "External interrupt", + [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", + [EXIT_REASON_INIT] = "INIT signal", + [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", + [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", + [EXIT_REASON_SMI] = "Other SMI", + [EXIT_REASON_INTR_WINDOW] = "Interrupt window", + [EXIT_REASON_NMI_WINDOW] = "NMI window", + [EXIT_REASON_TASK_SWITCH] = "Task switch", + [EXIT_REASON_CPUID] = "CPUID", + [EXIT_REASON_GETSEC] = "GETSEC", + [EXIT_REASON_HLT] = "HLT", + [EXIT_REASON_INVD] = "INVD", + [EXIT_REASON_INVLPG] = "INVLPG", + [EXIT_REASON_RDPMC] = "RDPMC", + [EXIT_REASON_RDTSC] = "RDTSC", + [EXIT_REASON_RSM] = "RSM", + [EXIT_REASON_VMCALL] = "VMCALL", + [EXIT_REASON_VMCLEAR] = "VMCLEAR", + [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", + [EXIT_REASON_VMPTRLD] = "VMPTRLD", + [EXIT_REASON_VMPTRST] = "VMPTRST", + [EXIT_REASON_VMREAD] = "VMREAD", + [EXIT_REASON_VMRESUME] = "VMRESUME", + [EXIT_REASON_VMWRITE] = "VMWRITE", + [EXIT_REASON_VMXOFF] = "VMXOFF", + [EXIT_REASON_VMXON] = "VMXON", + [EXIT_REASON_CR_ACCESS] = "Control-register accesses", + [EXIT_REASON_DR_ACCESS] = "MOV DR", + [EXIT_REASON_INOUT] = "I/O instruction", + [EXIT_REASON_RDMSR] = "RDMSR", + [EXIT_REASON_WRMSR] = "WRMSR", + [EXIT_REASON_INVAL_VMCS] = + "VM-entry failure due to invalid guest state", + [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", + [EXIT_REASON_MWAIT] = "MWAIT", + [EXIT_REASON_MTF] = "Monitor trap flag", + [EXIT_REASON_MONITOR] = "MONITOR", + [EXIT_REASON_PAUSE] = "PAUSE", + [EXIT_REASON_MCE_DURING_ENTRY] = + "VM-entry failure due to machine-check event", + [EXIT_REASON_TPR] = "TPR below threshold", + [EXIT_REASON_APIC_ACCESS] = "APIC access", + [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", + [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", + [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", + [EXIT_REASON_EPT_FAULT] = "EPT violation", + [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", + [EXIT_REASON_INVEPT] = "INVEPT", + [EXIT_REASON_RDTSCP] = "RDTSCP", + [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", + [EXIT_REASON_INVVPID] = "INVVPID", + [EXIT_REASON_WBINVD] = "WBINVD", + [EXIT_REASON_XSETBV] = "XSETBV", + [EXIT_REASON_APIC_WRITE] = "APIC write", + [EXIT_REASON_RDRAND] = "RDRAND", + [EXIT_REASON_INVPCID] = "INVPCID", + [EXIT_REASON_VMFUNC] = "VMFUNC", + [EXIT_REASON_ENCLS] = "ENCLS", + [EXIT_REASON_RDSEED] = "RDSEED", + [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", + [EXIT_REASON_XSAVES] = "XSAVES", + [EXIT_REASON_XRSTORS] = "XRSTORS" +}; + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); + +char *vmname; + +int guest_ncpus; +uint16_t cores, maxcpus, sockets, threads; + +char *guest_uuid_str; + +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; +static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ + +static int strictio; +static int strictmsr = 1; + +static int acpi; + +static char *progname; +static const int BSP = 0; + +#ifndef __FreeBSD__ +int bcons_wait = 0; +int bcons_connected = 0; +pthread_mutex_t bcons_wait_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t bcons_wait_done = PTHREAD_COND_INITIALIZER; +#endif + +static cpuset_t cpumask; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +static struct vm_exit vmexit[VM_MAXCPU]; + +struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_reqidle; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +#ifdef __FreeBSD__ +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; +#endif + +static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-abehuwxACHPSWY]\n" + " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" + " %*s [-g <gdb port>] [-l <lpc>]\n" +#ifdef __FreeBSD__ + " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n" +#else + " %*s [-m mem] [-s <pci>] [-U uuid] <vm>\n" +#endif + " -a: local apic is in xAPIC mode (deprecated)\n" + " -A: create ACPI tables\n" + " -c: number of cpus and/or topology specification\n" + " -C: include guest memory in core file\n" +#ifndef __FreeBSD__ + " -d: suspend cpu at boot\n" +#endif + " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" + " -h: help\n" + " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size\n" +#ifdef __FreeBSD__ + " -p: pin 'vcpu' to 'hostcpu'\n" +#endif + " -P: vmexit from the guest on pause\n" + " -s: <slot,driver,configinfo> PCI slot config\n" + " -S: guest memory cannot be swapped\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" + " -w: ignore unimplemented MSRs\n" + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n", + progname, (int)strlen(progname), "", (int)strlen(progname), "", + (int)strlen(progname), ""); + + exit(code); +} + +/* + * XXX This parser is known to have the following issues: + * 1. It accepts null key=value tokens ",,". + * 2. It accepts whitespace after = and before value. + * 3. Values out of range of INT are silently wrapped. + * 4. It doesn't check non-final values. + * 5. The apparently bogus limits of UINT16_MAX are for future expansion. + * + * The acceptance of a null specification ('-c ""') is by design to match the + * manual page syntax specification, this results in a topology of 1 vCPU. + */ +static int +topology_parse(const char *opt) +{ + uint64_t ncpus; + int c, chk, n, s, t, tmp; + char *cp, *str; + bool ns, scts; + + c = 1, n = 1, s = 1, t = 1; + ns = false, scts = false; + str = strdup(opt); + if (str == NULL) + goto out; + + while ((cp = strsep(&str, ",")) != NULL) { + if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { + s = tmp; + scts = true; + } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { + c = tmp; + scts = true; + } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { + t = tmp; + scts = true; +#ifdef notyet /* Do not expose this until vmm.ko implements it */ + } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { + m = tmp; +#endif + /* Skip the empty argument case from -c "" */ + } else if (cp[0] == '\0') + continue; + else + goto out; + /* Any trailing garbage causes an error */ + if (cp[chk] != '\0') + goto out; + } + free(str); + str = NULL; + + /* + * Range check 1 <= n <= UINT16_MAX all values + */ + if (n < 1 || s < 1 || c < 1 || t < 1 || + n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || + t > UINT16_MAX) + return (-1); + + /* If only the cpus was specified, use that as sockets */ + if (!scts) + s = n; + /* + * Compute sockets * cores * threads avoiding overflow + * The range check above insures these are 16 bit values + * If n was specified check it against computed ncpus + */ + ncpus = (uint64_t)s * c * t; + if (ncpus > UINT16_MAX || (ns && n != ncpus)) + return (-1); + + guest_ncpus = ncpus; + sockets = s; + cores = c; + threads = t; + return(0); + +out: + free(str); + return (-1); +} + +#ifndef WITHOUT_CAPSICUM +/* + * 11-stable capsicum helpers + */ +static void +bhyve_caph_cache_catpages(void) +{ + + (void)catopen("libc", NL_CAT_LOCALE); +} + +static int +bhyve_caph_limit_stdoe(void) +{ + cap_rights_t rights; + unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ }; + int i, fds[] = { STDOUT_FILENO, STDERR_FILENO }; + + cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL); + cap_rights_set(&rights, CAP_WRITE); + + for (i = 0; i < nitems(fds); i++) { + if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS) + return (-1); + + if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS) + return (-1); + + if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS) + return (-1); + } + + return (0); +} + +#endif + +#ifdef __FreeBSD__ +static int +pincpu_parse(const char *opt) +{ + int vcpu, pcpu; + + if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { + fprintf(stderr, "invalid format: %s\n", opt); + return (-1); + } + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", + vcpu, VM_MAXCPU - 1); + return (-1); + } + + if (pcpu < 0 || pcpu >= CPU_SETSIZE) { + fprintf(stderr, "hostcpu '%d' outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} +#endif + +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, + int errcode) +{ + struct vmctx *ctx; + int error, restart_instruction; + + ctx = arg; + restart_instruction = 1; + + error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); +} + +void * +paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) +{ + + return (vm_map_gpa(ctx, gaddr, len)); +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_virtio_msix(void) +{ + + return (virtio_msix); +} + +static void * +fbsdrun_start_thread(void *param) +{ + char tname[MAXCOMLEN + 1]; + struct mt_vmm_info *mtp; + int vcpu; + + mtp = param; + vcpu = mtp->mt_vcpu; + + snprintf(tname, sizeof(tname), "vcpu %d", vcpu); + pthread_set_name_np(mtp->mt_thr, tname); + + gdb_cpu_add(vcpu); + + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +#ifdef __FreeBSD__ +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) +#else +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, + bool suspend) +#endif +{ + int error; + + assert(fromcpu == BSP); + + /* + * The 'newcpu' must be activated in the context of 'fromcpu'. If + * vm_activate_cpu() is delayed until newcpu's pthread starts running + * then vmm.ko is out-of-sync with bhyve and this can create a race + * with vm_suspend(). + */ + error = vm_activate_cpu(ctx, newcpu); + if (error != 0) + err(EX_OSERR, "could not activate CPU %d", newcpu); + + CPU_SET_ATOMIC(newcpu, &cpumask); + +#ifndef __FreeBSD__ + if (suspend) + (void) vm_suspend_cpu(ctx, newcpu); +#endif + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[newcpu].rip = rip; + vmexit[newcpu].inst_length = 0; + + mt_vmm_info[newcpu].mt_ctx = ctx; + mt_vmm_info[newcpu].mt_vcpu = newcpu; + + error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[newcpu]); + assert(error == 0); +} + +static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if (!CPU_ISSET(vcpu, &cpumask)) { + fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + exit(4); + } + + CPU_CLR_ATOMIC(vcpu, &cpumask); + return (CPU_EMPTY(&cpumask)); +} + +static int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if BHYVE_DEBUG + /* + * put guest-driven debug here + */ +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + in = vme->u.inout.in; + out = !in; + + /* Extra-special case of host notifications */ + if (out && port == GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + return (error); + } + + error = emulate_inout(ctx, vcpu, vme, strictio); + if (error) { + fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), + port, vmexit->rip); + return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + uint64_t val; + uint32_t eax, edx; + int error; + + val = 0; + error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); + if (error != 0) { + fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + vme->u.msr.code, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + + eax = val; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + + edx = val >> 32; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); + assert(error == 0); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + + error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); + if (error != 0) { + fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", + vme->u.msr.code, vme->u.msr.wval, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + + (void)spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (VMEXIT_CONTINUE); +} + +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + +static const char * +vmexit_vmx_desc(uint32_t exit_reason) +{ + + if (exit_reason >= nitems(vmx_exit_reason_desc) || + vmx_exit_reason_desc[exit_reason] == NULL) + return ("Unknown"); + return (vmx_exit_reason_desc[exit_reason]); +} + +static int +vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tVMX\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); + fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, + vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); + fprintf(stderr, "\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ + return (VMEXIT_ABORT); +} + +static int +vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tSVM\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); + fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); + fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_bogus++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_reqidle++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_hlt++; + + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); +} + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_pause++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_mtrap++; + + gdb_cpu_mtrap(*pvcpu); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int err, i; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vmexit->u.inst_emul.vie; + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, + vie, &vmexit->u.inst_emul.paging); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", + vmexit->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? " " : ""); + } + fprintf(stderr, "] at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + fbsdrun_deletecpu(ctx, *pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + exit(1); + case VM_SUSPEND_HALT: + exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + return (0); /* NOTREACHED */ +} + +static int +vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + gdb_cpu_suspend(*pvcpu); + return (VMEXIT_CONTINUE); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_SVM] = vmexit_svm, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_REQIDLE] = vmexit_reqidle, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, + [VM_EXITCODE_DEBUG] = vmexit_debug, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) +{ + int error, rc; + enum vm_exitcode exitcode; + cpuset_t active_cpus; + +#ifdef __FreeBSD__ + if (vcpumap[vcpu] != NULL) { + error = pthread_setaffinity_np(pthread_self(), + sizeof(cpuset_t), vcpumap[vcpu]); + assert(error == 0); + } +#endif + error = vm_active_cpus(ctx, &active_cpus); + assert(CPU_ISSET(vcpu, &active_cpus)); + + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + + while (1) { + error = vm_run(ctx, vcpu, &vmexit[vcpu]); + if (error != 0) + break; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(4); + } + + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); + default: + exit(4); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + +static int +num_vcpus_allowed(struct vmctx *ctx) +{ + int tmp, error; + + error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); + + /* + * The guest is allowed to spinup more than one processor only if the + * UNRESTRICTED_GUEST capability is available. + */ + if (error == 0) + return (VM_MAXCPU); + else + return (1); +} + +void +fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) +{ + int err, tmp; + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, "VM exit on HLT not supported\n"); + exit(4); + } + vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, + "SMP mux requested, no pause support\n"); + exit(4); + } + vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (x2apic_mode) + err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + else + err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); + + if (err) { + fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + exit(4); + } + +#ifdef __FreeBSD__ + vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); +#endif +} + +static struct vmctx * +do_open(const char *vmname) +{ + struct vmctx *ctx; + int error; + bool reinit, romboot; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + const cap_ioctl_t *cmds; + size_t ncmds; +#endif + + reinit = romboot = false; + + if (lpc_bootrom()) + romboot = true; + + error = vm_create(vmname); + if (error) { + if (errno == EEXIST) { + if (romboot) { + reinit = true; + } else { + /* + * The virtual machine has been setup by the + * userspace bootloader. + */ + } + } else { + perror("vm_create"); + exit(4); + } + } else { + if (!romboot) { + /* + * If the virtual machine was just created then a + * bootrom must be configured to boot it. + */ + fprintf(stderr, "virtual machine cannot be booted\n"); + exit(4); + } + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + vm_get_ioctls(&ncmds); + cmds = vm_get_ioctls(NULL); + if (cmds == NULL) + errx(EX_OSERR, "out of memory"); + if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + free((cap_ioctl_t *)cmds); +#endif + + if (reinit) { + error = vm_reinit(ctx); + if (error) { + perror("vm_reinit"); + exit(4); + } + } + error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); + if (error) + errx(EX_OSERR, "vm_set_topology"); + return (ctx); +} + +#ifndef __FreeBSD__ + +#define FILE_PROVISIONING "/var/svc/provisioning" +#define FILE_PROVISION_SUCCESS "/var/svc/provision_success" + +static void +mark_provisioned(void) +{ + struct stat stbuf; + + if (lstat(FILE_PROVISIONING, &stbuf) != 0) + return; + + if (rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS) != 0) { + (void) fprintf(stderr, "Cannot rename %s to %s: %s\n", + FILE_PROVISIONING, FILE_PROVISION_SUCCESS, + strerror(errno)); + } +} + +#endif + +int +main(int argc, char *argv[]) +{ + int c, error, dbg_port, gdb_port, err, bvmcons; + int max_vcpus, mptgen, memflags; + int rtc_localtime; + bool gdb_stop; +#ifndef __FreeBSD__ + bool suspend = false; +#endif + struct vmctx *ctx; + uint64_t rip; + size_t memsize; + char *optstr; + + bvmcons = 0; + progname = basename(argv[0]); + dbg_port = 0; + gdb_port = 0; + gdb_stop = false; + guest_ncpus = 1; + sockets = cores = threads = 1; + maxcpus = 0; + memsize = 256 * MB; + mptgen = 1; + rtc_localtime = 1; + memflags = 0; + +#ifdef __FreeBSD__ + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:"; +#else + optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:"; +#endif + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'a': + x2apic_mode = 0; + break; + case 'A': + acpi = 1; + break; + case 'b': + bvmcons = 1; + break; + case 'B': + if (smbios_parse(optarg) != 0) { + errx(EX_USAGE, "invalid SMBIOS " + "configuration '%s'", optarg); + } + break; +#ifndef __FreeBSD__ + case 'd': + suspend = true; + break; +#else + case 'p': + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } + break; +#endif + case 'c': + if (topology_parse(optarg) != 0) { + errx(EX_USAGE, "invalid cpu topology " + "'%s'", optarg); + } + break; + case 'C': + memflags |= VM_MEM_F_INCORE; + break; + case 'g': + dbg_port = atoi(optarg); + break; + case 'G': + if (optarg[0] == 'w') { + gdb_stop = true; + optarg++; + } + gdb_port = atoi(optarg); + break; + case 'l': + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + lpc_print_supported_devices(); + exit(0); + } else if (lpc_device_parse(optarg) != 0) { + errx(EX_USAGE, "invalid lpc device " + "configuration '%s'", optarg); + } + break; + case 's': + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); + else + break; + case 'S': + memflags |= VM_MEM_F_WIRED; + break; + case 'm': + error = vm_parse_memsize(optarg, &memsize); + if (error) + errx(EX_USAGE, "invalid memsize '%s'", optarg); + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'I': + /* + * The "-I" option was used to add an ioapic to the + * virtual machine. + * + * An ioapic is now provided unconditionally for each + * virtual machine and this option is now deprecated. + */ + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'e': + strictio = 1; + break; + case 'u': + rtc_localtime = 0; + break; + case 'U': + guest_uuid_str = optarg; + break; + case 'w': + strictmsr = 0; + break; + case 'W': + virtio_msix = 0; + break; + case 'x': + x2apic_mode = 1; + break; + case 'Y': + mptgen = 0; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + vmname = argv[0]; + ctx = do_open(vmname); + + max_vcpus = num_vcpus_allowed(ctx); + if (guest_ncpus > max_vcpus) { + fprintf(stderr, "%d vCPUs requested but only %d available\n", + guest_ncpus, max_vcpus); + exit(4); + } + + fbsdrun_set_capabilities(ctx, BSP); + + vm_set_memflags(ctx, memflags); +#ifdef __FreeBSD__ + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); +#else + do { + errno = 0; + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = errno; + if (err != 0 && error == ENOMEM) { + (void) fprintf(stderr, "Unable to allocate memory " + "(%llu), retrying in 1 second\n", memsize); + sleep(1); + } + } while (error == ENOMEM); +#endif + if (err) { + fprintf(stderr, "Unable to setup memory (%d)\n", errno); + exit(4); + } + + error = init_msr(); + if (error) { + fprintf(stderr, "init_msr error %d", error); + exit(4); + } + + init_mem(); + init_inout(); + atkbdc_init(ctx); + pci_irq_init(ctx); + ioapic_init(ctx); + + rtc_init(ctx, rtc_localtime); + sci_init(ctx); + + /* + * Exit if a device emulation finds an error in its initilization + */ + if (init_pci(ctx) != 0) { + perror("device emulation initialization error"); + exit(4); + } + + if (dbg_port != 0) + init_dbgport(dbg_port); + + if (gdb_port != 0) + init_gdb(ctx, gdb_port, gdb_stop); + + if (bvmcons) + init_bvmcons(); + + vga_init(1); + + if (lpc_bootrom()) { + if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { + fprintf(stderr, "ROM boot failed: unrestricted guest " + "capability not available\n"); + exit(4); + } + error = vcpu_reset(ctx, BSP); + assert(error == 0); + } + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + /* + * build the guest tables, MP etc. + */ + if (mptgen) { + error = mptable_build(ctx, guest_ncpus); + if (error) { + perror("error to build the guest tables"); + exit(4); + } + } + + error = smbios_build(ctx); + assert(error == 0); + + if (acpi) { + error = acpi_build(ctx, guest_ncpus); + assert(error == 0); + } + + if (lpc_bootrom()) + fwctl_init(); + + /* + * Change the proc title to include the VM name. + */ + setproctitle("%s", vmname); + +#ifndef WITHOUT_CAPSICUM + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + +#ifndef __FreeBSD__ + /* + * If applicable, wait for bhyveconsole + */ + if (bcons_wait) { + printf("Waiting for bhyveconsole connection...\n"); + (void) pthread_mutex_lock(&bcons_wait_lock); + while (!bcons_connected) { + (void) pthread_cond_wait(&bcons_wait_done, + &bcons_wait_lock); + } + (void) pthread_mutex_unlock(&bcons_wait_lock); + } +#endif + + /* + * Add CPU 0 + */ +#ifdef __FreeBSD__ + fbsdrun_addcpu(ctx, BSP, BSP, rip); +#else + fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend); + + mark_provisioned(); +#endif + + /* + * Head off to the main event dispatch loop + */ + mevent_dispatch(); + + exit(4); +} diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h new file mode 100644 index 0000000000..cdde04862c --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) + +struct vmctx; +extern int guest_ncpus; +extern char *guest_uuid_str; +extern char *vmname; +#ifndef __FreeBSD__ +extern int bcons_wait; +extern int bcons_connected; +extern pthread_mutex_t bcons_wait_lock; +extern pthread_cond_t bcons_wait_done; +#endif + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); +#ifdef __FreeBSD__ +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); +#else +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, + bool suspend); +#endif +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c new file mode 100644 index 0000000000..fcb4149b62 --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.c @@ -0,0 +1,1028 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/queue.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/disk.h> +#include <sys/limits.h> +#include <sys/uio.h> +#ifndef __FreeBSD__ +#include <sys/dkio.h> +#endif + +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <pthread_np.h> +#include <signal.h> +#include <sysexits.h> +#include <unistd.h> + +#include <machine/atomic.h> + +#include "bhyverun.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#ifdef __FreeBSD__ +#define BLOCKIF_NUMTHR 8 +#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) +#else +/* Enlarge to keep pace with the virtio-block ring size */ +#define BLOCKIF_NUMTHR 16 +#define BLOCKIF_MAXREQ (128 + BLOCKIF_NUMTHR) +#endif + +enum blockop { + BOP_READ, + BOP_WRITE, +#ifndef __FreeBSD__ + BOP_WRITE_SYNC, +#endif + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +#ifndef __FreeBSD__ +enum blockif_wce { + WCE_NONE = 0, + WCE_IOCTL, + WCE_FCNTL +}; +#endif + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; +#ifndef __FreeBSD__ + enum blockif_wce bc_wce; +#endif + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be, *tbe; + off_t off; + int i; + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_req = breq; + be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: +#ifndef __FreeBSD__ + case BOP_WRITE_SYNC: +#endif + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + return (be->be_status == BST_PEND); +} + +static int +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = t; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + *bep = be; + return (1); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_elem *tbe; + + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +{ + struct blockif_req *br; +#ifdef __FreeBSD__ + off_t arg[2]; +#endif + ssize_t clen, len, off, boff, voff; + int i, err; + + br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; + err = 0; + switch (be->be_op) { + case BOP_READ: + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + break; + case BOP_WRITE: + if (bc->bc_rdonly) { + err = EROFS; + break; + } + if (buf == NULL) { + if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + break; + case BOP_FLUSH: +#ifdef __FreeBSD__ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; +#else + /* + * This fsync() should be adequate to flush the cache of a file + * or device. In VFS, the VOP_SYNC operation is converted to + * the appropriate ioctl in both sdev (for real devices) and + * zfs (for zvols). + */ + if (fsync(bc->bc_fd)) + err = errno; +#endif + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; +#ifdef __FreeBSD__ + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } +#endif + else + err = EOPNOTSUPP; + break; + default: + err = EINVAL; + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + pthread_t t; + uint8_t *buf; + + bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + + pthread_mutex_lock(&bc->bc_mtx); + for (;;) { + while (blockif_dequeue(bc, t, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be, buf); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + /* Check ctxt status here to see if exit requested */ + if (bc->bc_closing) + break; + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + } + pthread_mutex_unlock(&bc->bc_mtx); + + if (buf) + free(buf); + pthread_exit(NULL); + return (NULL); +} + +#ifdef __FreeBSD__ +static void +blockif_sigcont_handler(int signal, enum ev_type type, void *arg) +#else +static void +blockif_sigcont_handler(int signal) +#endif +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ +#ifdef __FreeBSD__ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +#else + (void) sigset(SIGCONT, blockif_sigcont_handler); +#endif +} + +struct blockif_ctxt * +blockif_open(const char *optstr, const char *ident) +{ + char tname[MAXCOMLEN + 1]; +#ifdef __FreeBSD__ + char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; +#else + char *nopt, *xopts, *cp = NULL; +#endif + struct blockif_ctxt *bc; + struct stat sbuf; +#ifdef __FreeBSD__ + struct diocgattr_arg arg; +#else + enum blockif_wce wce = WCE_NONE; +#endif + off_t size, psectsz, psectoff; + int extra, fd, i, sectsz; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif + + pthread_once(&blockif_once, blockif_init); + + fd = -1; + ssopt = 0; + nocache = 0; + sync = 0; + ro = 0; + + /* + * The first element in the optstring is always a pathname. + * Optional elements follow + */ + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) + nocache = 1; + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) + sync = 1; + else if (!strcmp(cp, "ro")) + ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } + } + + extra = 0; + if (nocache) + extra |= O_DIRECT; + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + warn("Could not open backing file: %s", nopt); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + warn("Could not stat backing file %s", nopt); + goto err; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; +#ifdef __FreeBSD__ + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + goto err; + } + assert(size != 0); + assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else { + psectsz = sbuf.st_blksize; + } +#else + psectsz = sbuf.st_blksize; + if (S_ISCHR(sbuf.st_mode)) { + struct dk_minfo_ext dkmext; + int wce_val; + + /* Look for a more accurate physical blocksize */ + if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { + psectsz = dkmext.dki_pbsize; + } + /* See if a configurable write cache is present and working */ + if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { + /* + * If WCE is already active, disable it until the + * specific device driver calls for its return. If it + * is not active, toggle it on and off to verify that + * such actions are possible. + */ + if (wce_val != 0) { + wce_val = 0; + /* + * Inability to disable the cache is a threat + * to data durability. + */ + assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); + wce = WCE_IOCTL; + } else { + int r1, r2; + + wce_val = 1; + r1 = ioctl(fd, DKIOCSETWCE, &wce_val); + wce_val = 0; + r2 = ioctl(fd, DKIOCSETWCE, &wce_val); + + if (r1 == 0 && r2 == 0) { + wce = WCE_IOCTL; + } else { + /* + * If the cache cache toggle was not + * successful, ensure that the cache + * was not left enabled. + */ + assert(r1 != 0); + } + } + } + } else { + int flags; + + if ((flags = fcntl(fd, F_GETFL)) >= 0) { + flags |= O_DSYNC; + if (fcntl(fd, F_SETFL, flags) != -1) { + wce = WCE_FCNTL; + } + } + } +#endif + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. + */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + fprintf(stderr, "Sector size %d incompatible " + "with underlying device sector size %d\n", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + perror("calloc"); + goto err; + } + + bc->bc_magic = BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; +#ifndef __FreeBSD__ + bc->bc_wce = wce; +#endif + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } + + return (bc); +err: + if (fd >= 0) + close(fd); +#ifdef __FreeBSD__ + free(cp); + free(xopts); + free(nopt); +#else + free(nopt); +#endif + return (NULL); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (!TAILQ_EMPTY(&bc->bc_freeq)) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. + */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + blockif_complete(bc, be); + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it return + * prematurely via it's normal callback path. + */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int i; + + assert(bc->bc_magic == BLOCKIF_SIG); + + /* + * Stop the block i/o thread + */ + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_closing = 1; + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535UL*16*255) + sectors = 65535UL*16*255; + + if (sectors >= 65536UL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_sectsz); +} + +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_rdonly); +} + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} + +#ifndef __FreeBSD__ +int +blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) +{ + int res = 0, flags; + int clean_val = (wc_enable != 0) ? 1 : 0; + + (void) pthread_mutex_lock(&bc->bc_mtx); + switch (bc->bc_wce) { + case WCE_IOCTL: + res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); + break; + case WCE_FCNTL: + if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { + if (wc_enable == 0) { + flags |= O_DSYNC; + } else { + flags &= ~O_DSYNC; + } + if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { + res = -1; + } + } else { + res = -1; + } + break; + default: + break; + } + + /* + * After a successful disable of the write cache, ensure that any + * lingering data in the cache is synced out. + */ + if (res == 0 && wc_enable == 0) { + res = fsync(bc->bc_fd); + } + (void) pthread_mutex_unlock(&bc->bc_mtx); + + return (res); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h new file mode 100644 index 0000000000..8401cd9529 --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.h @@ -0,0 +1,84 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#ifndef _BLOCK_IF_H_ +#define _BLOCK_IF_H_ + +#include <sys/uio.h> +#include <sys/unistd.h> + +#ifdef __FreeBSD__ +#define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */ +#else +/* + * Upstream is in the process of bumping this up to 128 for several reasons, + * including Windows compatibility. For the sake of our Windows support, we + * will use the higher value now. + */ +#define BLOCKIF_IOV_MAX 128 +#endif + +struct blockif_req { + int br_iovcnt; + off_t br_offset; + ssize_t br_resid; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; +}; + +struct blockif_ctxt; +struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +#ifndef __FreeBSD__ +int blockif_set_wce(struct blockif_ctxt *bc, int enable); +#endif +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); + +#endif /* _BLOCK_IF_H_ */ diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c new file mode 100644 index 0000000000..b8c63828c8 --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <machine/vmm.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <stdbool.h> + +#include <vmmapi.h> +#include "bhyverun.h" +#include "bootrom.h" + +#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ + +int +bootrom_init(struct vmctx *ctx, const char *romfile) +{ + struct stat sbuf; + vm_paddr_t gpa; + ssize_t rlen; + char *ptr; + int fd, i, rv, prot; + + rv = -1; + fd = open(romfile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "Error opening bootrom \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + if (fstat(fd, &sbuf) < 0) { + fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + /* + * Limit bootrom size to 16MB so it doesn't encroach into reserved + * MMIO space (e.g. APIC, HPET, MSI). + */ + if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) { + fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size); + goto done; + } + + if (sbuf.st_size & PAGE_MASK) { + fprintf(stderr, "Bootrom size %ld is not a multiple of the " + "page size\n", sbuf.st_size); + goto done; + } + + ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size); + if (ptr == MAP_FAILED) + goto done; + + /* Map the bootrom into the guest address space */ + prot = PROT_READ | PROT_EXEC; + gpa = (1ULL << 32) - sbuf.st_size; + if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0) + goto done; + + /* Read 'romfile' into the guest address space */ + for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) { + rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); + if (rlen != PAGE_SIZE) { + fprintf(stderr, "Incomplete read of page %d of bootrom " + "file %s: %ld bytes\n", i, romfile, rlen); + goto done; + } + } + rv = 0; +done: + if (fd >= 0) + close(fd); + return (rv); +} diff --git a/usr/src/cmd/bhyve/bootrom.h b/usr/src/cmd/bhyve/bootrom.h new file mode 100644 index 0000000000..7fb12181dd --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.h @@ -0,0 +1,40 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BOOTROM_H_ +#define _BOOTROM_H_ + +#include <stdbool.h> + +struct vmctx; + +int bootrom_init(struct vmctx *ctx, const char *romfile); + +#endif diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c new file mode 100644 index 0000000000..2567f69959 --- /dev/null +++ b/usr/src/cmd/bhyve/console.c @@ -0,0 +1,120 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include "bhyvegc.h" +#include "console.h" + +static struct { + struct bhyvegc *gc; + + fb_render_func_t fb_render_cb; + void *fb_arg; + + kbd_event_func_t kbd_event_cb; + void *kbd_arg; + int kbd_priority; + + ptr_event_func_t ptr_event_cb; + void *ptr_arg; + int ptr_priority; +} console; + +void +console_init(int w, int h, void *fbaddr) +{ + console.gc = bhyvegc_init(w, h, fbaddr); +} + +void +console_set_fbaddr(void *fbaddr) +{ + bhyvegc_set_fbaddr(console.gc, fbaddr); +} + +struct bhyvegc_image * +console_get_image(void) +{ + struct bhyvegc_image *bhyvegc_image; + + bhyvegc_image = bhyvegc_get_image(console.gc); + + return (bhyvegc_image); +} + +void +console_fb_register(fb_render_func_t render_cb, void *arg) +{ + console.fb_render_cb = render_cb; + console.fb_arg = arg; +} + +void +console_refresh(void) +{ + if (console.fb_render_cb) + (*console.fb_render_cb)(console.gc, console.fb_arg); +} + +void +console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri) +{ + if (pri > console.kbd_priority) { + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; + console.kbd_priority = pri; + } +} + +void +console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri) +{ + if (pri > console.ptr_priority) { + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; + console.ptr_priority = pri; + } +} + +void +console_key_event(int down, uint32_t keysym) +{ + if (console.kbd_event_cb) + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); +} + +void +console_ptr_event(uint8_t button, int x, int y) +{ + if (console.ptr_event_cb) + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); +} diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h new file mode 100644 index 0000000000..0d0a854866 --- /dev/null +++ b/usr/src/cmd/bhyve/console.h @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CONSOLE_H_ +#define _CONSOLE_H_ + +struct bhyvegc; + +typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); +typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); +typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); + +void console_init(int w, int h, void *fbaddr); + +void console_set_fbaddr(void *fbaddr); + +struct bhyvegc_image *console_get_image(void); + +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); + +void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); +void console_key_event(int down, uint32_t keysym); + +void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); +void console_ptr_event(uint8_t button, int x, int y); + +#endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c new file mode 100644 index 0000000000..cda2df2414 --- /dev/null +++ b/usr/src/cmd/bhyve/consport.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/select.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <termios.h> +#include <unistd.h> +#include <stdbool.h> +#include <sysexits.h> + +#include "inout.h" +#include "pci_lpc.h" + +#define BVM_CONSOLE_PORT 0x220 +#define BVM_CONS_SIG ('b' << 8 | 'v') + +#ifdef __FreeBSD__ +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} +#endif + +static void +ttyopen(void) +{ +#ifdef __FreeBSD__ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +#endif +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static int opened; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif + + if (bytes == 2 && in) { + *eax = BVM_CONS_SIG; + return (0); + } + + /* + * Guests might probe this port to look for old ISA devices + * using single-byte reads. Return 0xff for those. + */ + if (bytes == 1 && in) { + *eax = 0xff; + return (0); + } + + if (bytes != 4) + return (-1); + + if (!opened) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, + CAP_WRITE); + if (caph_rights_limit(STDIN_FILENO, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + ttyopen(); + opened = 1; + } + + if (in) + *eax = ttyread(); + else + ttywrite(*eax); + + return (0); +} + +SYSRES_IO(BVM_CONSOLE_PORT, 4); + +static struct inout_port consport = { + "bvmcons", + BVM_CONSOLE_PORT, + 1, + IOPORT_F_INOUT, + console_handler +}; + +void +init_bvmcons(void) +{ + + register_inout(&consport); +} diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c new file mode 100644 index 0000000000..88a616b50d --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sys/uio.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <stdio.h> +#include <stdlib.h> +#include <sysexits.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> + +#include "inout.h" +#include "dbgport.h" +#include "pci_lpc.h" + +#define BVM_DBG_PORT 0x224 +#define BVM_DBG_SIG ('B' << 8 | 'V') + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int nwritten, nread, printonce; + int on = 1; + char ch; + + if (bytes == 2 && in) { + *eax = BVM_DBG_SIG; + return (0); + } + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK); + if (conn_fd >= 0) { + /* Avoid EPIPE after the client drops off. */ + (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE, + &on, sizeof(on)); + /* Improve latency for one byte at a time tranfers. */ + (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, + &on, sizeof(on)); + } else if (errno != EINTR) { + perror("accept"); + } + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +static struct inout_port dbgport = { + "bvmdbg", + BVM_DBG_PORT, + 1, + IOPORT_F_INOUT, + dbg_handler +}; + +SYSRES_IO(BVM_DBG_PORT, 4); + +void +init_dbgport(int sport) +{ + int reuse; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + reuse = 1; + if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, + sizeof(reuse)) < 0) { + perror("cannot set socket options"); + exit(4); + } + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(listen_fd, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(listen_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + register_inout(&dbgport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h new file mode 100644 index 0000000000..407ff3ffbf --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.h @@ -0,0 +1,36 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +void init_dbgport(int port); + +#endif diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c new file mode 100644 index 0000000000..0640bc28ba --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.c @@ -0,0 +1,552 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, + * but with a request/response messaging protocol. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/uio.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "bhyverun.h" +#include "inout.h" +#include "fwctl.h" + +/* + * Messaging protocol base operations + */ +#define OP_NULL 1 +#define OP_ECHO 2 +#define OP_GET 3 +#define OP_GET_LEN 4 +#define OP_SET 5 +#define OP_MAX OP_SET + +/* I/O ports */ +#define FWCTL_OUT 0x510 +#define FWCTL_IN 0x511 + +/* + * Back-end state-machine + */ +enum state { + DORMANT, + IDENT_WAIT, + IDENT_SEND, + REQ, + RESP +} be_state = DORMANT; + +static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; +static u_int ident_idx; + +struct op_info { + int op; + int (*op_start)(uint32_t len); + void (*op_data)(uint32_t data, uint32_t len); + int (*op_result)(struct iovec **data); + void (*op_done)(struct iovec *data); +}; +static struct op_info *ops[OP_MAX+1]; + +/* Return 0-padded uint32_t */ +static uint32_t +fwctl_send_rest(uint32_t *data, size_t len) +{ + union { + uint8_t c[4]; + uint32_t w; + } u; + uint8_t *cdata; + int i; + + cdata = (uint8_t *) data; + u.w = 0; + + for (i = 0, u.w = 0; i < len; i++) + u.c[i] = *cdata++; + + return (u.w); +} + +/* + * error op dummy proto - drop all data sent and return an error +*/ +static int errop_code; + +static void +errop_set(int err) +{ + + errop_code = err; +} + +static int +errop_start(uint32_t len) +{ + errop_code = ENOENT; + + /* accept any length */ + return (errop_code); +} + +static void +errop_data(uint32_t data, uint32_t len) +{ + + /* ignore */ +} + +static int +errop_result(struct iovec **data) +{ + + /* no data to send back; always successful */ + *data = NULL; + return (errop_code); +} + +static void +errop_done(struct iovec *data) +{ + + /* assert data is NULL */ +} + +static struct op_info errop_info = { + .op_start = errop_start, + .op_data = errop_data, + .op_result = errop_result, + .op_done = errop_done +}; + +/* OID search */ +SET_DECLARE(ctl_set, struct ctl); + +CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); + +static struct ctl * +ctl_locate(const char *str, int maxlen) +{ + struct ctl *cp, **cpp; + + SET_FOREACH(cpp, ctl_set) { + cp = *cpp; + if (!strncmp(str, cp->c_oid, maxlen)) + return (cp); + } + + return (NULL); +} + +/* uefi-sysctl get-len */ +#define FGET_STRSZ 80 +static struct iovec fget_biov[2]; +static char fget_str[FGET_STRSZ]; +static struct { + size_t f_sz; + uint32_t f_data[1024]; +} fget_buf; +static int fget_cnt; +static size_t fget_size; + +static int +fget_start(uint32_t len) +{ + + if (len > FGET_STRSZ) + return(E2BIG); + + fget_cnt = 0; + + return (0); +} + +static void +fget_data(uint32_t data, uint32_t len) +{ + + *((uint32_t *) &fget_str[fget_cnt]) = data; + fget_cnt += sizeof(uint32_t); +} + +static int +fget_result(struct iovec **data, int val) +{ + struct ctl *cp; + int err; + + err = 0; + + /* Locate the OID */ + cp = ctl_locate(fget_str, fget_cnt); + if (cp == NULL) { + *data = NULL; + err = ENOENT; + } else { + if (val) { + /* For now, copy the len/data into a buffer */ + memset(&fget_buf, 0, sizeof(fget_buf)); + fget_buf.f_sz = cp->c_len; + memcpy(fget_buf.f_data, cp->c_data, cp->c_len); + fget_biov[0].iov_base = (char *)&fget_buf; + fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + + cp->c_len; + } else { + fget_size = cp->c_len; + fget_biov[0].iov_base = (char *)&fget_size; + fget_biov[0].iov_len = sizeof(fget_size); + } + + fget_biov[1].iov_base = NULL; + fget_biov[1].iov_len = 0; + *data = fget_biov; + } + + return (err); +} + +static void +fget_done(struct iovec *data) +{ + + /* nothing needs to be freed */ +} + +static int +fget_len_result(struct iovec **data) +{ + return (fget_result(data, 0)); +} + +static int +fget_val_result(struct iovec **data) +{ + return (fget_result(data, 1)); +} + +static struct op_info fgetlen_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_len_result, + .op_done = fget_done +}; + +static struct op_info fgetval_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_val_result, + .op_done = fget_done +}; + +static struct req_info { + int req_error; + u_int req_count; + uint32_t req_size; + uint32_t req_type; + uint32_t req_txid; + struct op_info *req_op; + int resp_error; + int resp_count; + size_t resp_size; + size_t resp_off; + struct iovec *resp_biov; +} rinfo; + +static void +fwctl_response_done(void) +{ + + (*rinfo.req_op->op_done)(rinfo.resp_biov); + + /* reinit the req data struct */ + memset(&rinfo, 0, sizeof(rinfo)); +} + +static void +fwctl_request_done(void) +{ + + rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); + + /* XXX only a single vector supported at the moment */ + rinfo.resp_off = 0; + if (rinfo.resp_biov == NULL) { + rinfo.resp_size = 0; + } else { + rinfo.resp_size = rinfo.resp_biov[0].iov_len; + } +} + +static int +fwctl_request_start(void) +{ + int err; + + /* Data size doesn't include header */ + rinfo.req_size -= 12; + + rinfo.req_op = &errop_info; + if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) + rinfo.req_op = ops[rinfo.req_type]; + + err = (*rinfo.req_op->op_start)(rinfo.req_size); + + if (err) { + errop_set(err); + rinfo.req_op = &errop_info; + } + + /* Catch case of zero-length message here */ + if (rinfo.req_size == 0) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request_data(uint32_t value) +{ + + /* Make sure remaining size is >= 0 */ + if (rinfo.req_size <= sizeof(uint32_t)) + rinfo.req_size = 0; + else + rinfo.req_size -= sizeof(uint32_t); + + (*rinfo.req_op->op_data)(value, rinfo.req_size); + + if (rinfo.req_size < sizeof(uint32_t)) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request(uint32_t value) +{ + + int ret; + + ret = 0; + + switch (rinfo.req_count) { + case 0: + /* Verify size */ + if (value < 12) { + printf("msg size error"); + exit(4); + } + rinfo.req_size = value; + rinfo.req_count = 1; + break; + case 1: + rinfo.req_type = value; + rinfo.req_count++; + break; + case 2: + rinfo.req_txid = value; + rinfo.req_count++; + ret = fwctl_request_start(); + break; + default: + ret = fwctl_request_data(value); + break; + } + + return (ret); +} + +static int +fwctl_response(uint32_t *retval) +{ + uint32_t *dp; + ssize_t remlen; + + switch(rinfo.resp_count) { + case 0: + /* 4 x u32 header len + data */ + *retval = 4*sizeof(uint32_t) + + roundup(rinfo.resp_size, sizeof(uint32_t)); + rinfo.resp_count++; + break; + case 1: + *retval = rinfo.req_type; + rinfo.resp_count++; + break; + case 2: + *retval = rinfo.req_txid; + rinfo.resp_count++; + break; + case 3: + *retval = rinfo.resp_error; + rinfo.resp_count++; + break; + default: + remlen = rinfo.resp_size - rinfo.resp_off; + dp = (uint32_t *) + ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); + if (remlen >= sizeof(uint32_t)) { + *retval = *dp; + } else if (remlen > 0) { + *retval = fwctl_send_rest(dp, remlen); + } + rinfo.resp_off += sizeof(uint32_t); + break; + } + + if (rinfo.resp_count > 3 && + rinfo.resp_off >= rinfo.resp_size) { + fwctl_response_done(); + return (1); + } + + return (0); +} + + +/* + * i/o port handling. + */ +static uint8_t +fwctl_inb(void) +{ + uint8_t retval; + + retval = 0xff; + + switch (be_state) { + case IDENT_SEND: + retval = sig[ident_idx++]; + if (ident_idx >= sizeof(sig)) + be_state = REQ; + break; + default: + break; + } + + return (retval); +} + +static void +fwctl_outw(uint16_t val) +{ + switch (be_state) { + case IDENT_WAIT: + if (val == 0) { + be_state = IDENT_SEND; + ident_idx = 0; + } + break; + default: + /* ignore */ + break; + } +} + +static uint32_t +fwctl_inl(void) +{ + uint32_t retval; + + switch (be_state) { + case RESP: + if (fwctl_response(&retval)) + be_state = REQ; + break; + default: + retval = 0xffffffff; + break; + } + + return (retval); +} + +static void +fwctl_outl(uint32_t val) +{ + + switch (be_state) { + case REQ: + if (fwctl_request(val)) + be_state = RESP; + default: + break; + } + +} + +static int +fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (in) { + if (bytes == 1) + *eax = fwctl_inb(); + else if (bytes == 4) + *eax = fwctl_inl(); + else + *eax = 0xffff; + } else { + if (bytes == 2) + fwctl_outw(*eax); + else if (bytes == 4) + fwctl_outl(*eax); + } + + return (0); +} +INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); +INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); + +void +fwctl_init(void) +{ + + ops[OP_GET_LEN] = &fgetlen_info; + ops[OP_GET] = &fgetval_info; + + be_state = IDENT_WAIT; +} diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h new file mode 100644 index 0000000000..6dad244811 --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FWCTL_H_ +#define _FWCTL_H_ + +#include <sys/linker_set.h> + +/* + * Linker set api for export of information to guest firmware via + * a sysctl-like OID interface + */ +struct ctl { + const char *c_oid; + const void *c_data; + const int c_len; +}; + +#define CTL_NODE(oid, data, len) \ + static struct ctl __CONCAT(__ctl, __LINE__) = { \ + oid, \ + (data), \ + (len), \ + }; \ + DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__)) + +void fwctl_init(void); + +#endif /* _FWCTL_H_ */ diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c new file mode 100644 index 0000000000..69bcf53c31 --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.c @@ -0,0 +1,1332 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2018 John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <machine/atomic.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <netinet/in.h> +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <pthread_np.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "mem.h" +#include "mevent.h" + +/* + * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops + * use SIGTRAP. + */ +#define GDB_SIGNAL_TRAP 5 + +static void gdb_resume_vcpus(void); +static void check_command(int fd); + +static struct mevent *read_event, *write_event; + +static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; +static pthread_mutex_t gdb_lock; +static pthread_cond_t idle_vcpus; +static bool stop_pending, first_stop; +#ifdef __FreeBSD__ +static int stepping_vcpu, stopped_vcpu; +#else +static int stepping_vcpu = -1, stopped_vcpu = -1; +#endif + +/* + * An I/O buffer contains 'capacity' bytes of room at 'data'. For a + * read buffer, 'start' is unused and 'len' contains the number of + * valid bytes in the buffer. For a write buffer, 'start' is set to + * the index of the next byte in 'data' to send, and 'len' contains + * the remaining number of valid bytes to send. + */ +struct io_buffer { + uint8_t *data; + size_t capacity; + size_t start; + size_t len; +}; + +static struct io_buffer cur_comm, cur_resp; +static uint8_t cur_csum; +static int cur_vcpu; +static struct vmctx *ctx; +static int cur_fd = -1; + +const int gdb_regset[] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS +}; + +const int gdb_regsize[] = { + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 4, + 4, + 4, + 4, + 4, + 4, + 4 +}; + +#ifdef GDB_LOG +#include <stdarg.h> +#include <stdio.h> + +static void __printflike(1, 2) +debug(const char *fmt, ...) +{ + static FILE *logfile; + va_list ap; + + if (logfile == NULL) { + logfile = fopen("/tmp/bhyve_gdb.log", "w"); + if (logfile == NULL) + return; +#ifndef WITHOUT_CAPSICUM + if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { + fclose(logfile); + logfile = NULL; + return; + } +#endif + setlinebuf(logfile); + } + va_start(ap, fmt); + vfprintf(logfile, fmt, ap); + va_end(ap); +} +#else +#define debug(...) +#endif + +static int +guest_paging_info(int vcpu, struct vm_guest_paging *paging) +{ + uint64_t regs[4]; + const int regset[4] = { + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_EFER + }; + + if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) + return (-1); + + /* + * For the debugger, always pretend to be the kernel (CPL 0), + * and if long-mode is enabled, always parse addresses as if + * in 64-bit mode. + */ + paging->cr3 = regs[1]; + paging->cpl = 0; + if (regs[3] & EFER_LMA) + paging->cpu_mode = CPU_MODE_64BIT; + else if (regs[0] & CR0_PE) + paging->cpu_mode = CPU_MODE_PROTECTED; + else + paging->cpu_mode = CPU_MODE_REAL; + if (!(regs[0] & CR0_PG)) + paging->paging_mode = PAGING_MODE_FLAT; + else if (!(regs[2] & CR4_PAE)) + paging->paging_mode = PAGING_MODE_32; + else if (regs[3] & EFER_LME) + paging->paging_mode = PAGING_MODE_64; + else + paging->paging_mode = PAGING_MODE_PAE; + return (0); +} + +/* + * Map a guest virtual address to a physical address (for a given vcpu). + * If a guest virtual address is valid, return 1. If the address is + * not valid, return 0. If an error occurs obtaining the mapping, + * return -1. + */ +static int +guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) +{ + struct vm_guest_paging paging; + int fault; + + if (guest_paging_info(vcpu, &paging) == -1) + return (-1); + + /* + * Always use PROT_READ. We really care if the VA is + * accessible, not if the current vCPU can write. + */ + if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, + &fault) == -1) + return (-1); + if (fault) + return (0); + return (1); +} + +static void +io_buffer_reset(struct io_buffer *io) +{ + + io->start = 0; + io->len = 0; +} + +/* Available room for adding data. */ +static size_t +io_buffer_avail(struct io_buffer *io) +{ + + return (io->capacity - (io->start + io->len)); +} + +static uint8_t * +io_buffer_head(struct io_buffer *io) +{ + + return (io->data + io->start); +} + +static uint8_t * +io_buffer_tail(struct io_buffer *io) +{ + + return (io->data + io->start + io->len); +} + +static void +io_buffer_advance(struct io_buffer *io, size_t amount) +{ + + assert(amount <= io->len); + io->start += amount; + io->len -= amount; +} + +static void +io_buffer_consume(struct io_buffer *io, size_t amount) +{ + + io_buffer_advance(io, amount); + if (io->len == 0) { + io->start = 0; + return; + } + + /* + * XXX: Consider making this move optional and compacting on a + * future read() before realloc(). + */ + memmove(io->data, io_buffer_head(io), io->len); + io->start = 0; +} + +static void +io_buffer_grow(struct io_buffer *io, size_t newsize) +{ + uint8_t *new_data; + size_t avail, new_cap; + + avail = io_buffer_avail(io); + if (newsize <= avail) + return; + + new_cap = io->capacity + (newsize - avail); + new_data = realloc(io->data, new_cap); + if (new_data == NULL) + err(1, "Failed to grow GDB I/O buffer"); + io->data = new_data; + io->capacity = new_cap; +} + +static bool +response_pending(void) +{ + + if (cur_resp.start == 0 && cur_resp.len == 0) + return (false); + if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') + return (false); + return (true); +} + +static void +close_connection(void) +{ + + /* + * XXX: This triggers a warning because mevent does the close + * before the EV_DELETE. + */ + pthread_mutex_lock(&gdb_lock); + mevent_delete(write_event); + mevent_delete_close(read_event); + write_event = NULL; + read_event = NULL; + io_buffer_reset(&cur_comm); + io_buffer_reset(&cur_resp); + cur_fd = -1; + + /* Resume any stopped vCPUs. */ + gdb_resume_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +static uint8_t +hex_digit(uint8_t nibble) +{ + + if (nibble <= 9) + return (nibble + '0'); + else + return (nibble + 'a' - 10); +} + +static uint8_t +parse_digit(uint8_t v) +{ + + if (v >= '0' && v <= '9') + return (v - '0'); + if (v >= 'a' && v <= 'f') + return (v - 'a' + 10); + if (v >= 'A' && v <= 'F') + return (v - 'A' + 10); + return (0xF); +} + +/* Parses big-endian hexadecimal. */ +static uintmax_t +parse_integer(const uint8_t *p, size_t len) +{ + uintmax_t v; + + v = 0; + while (len > 0) { + v <<= 4; + v |= parse_digit(*p); + p++; + len--; + } + return (v); +} + +static uint8_t +parse_byte(const uint8_t *p) +{ + + return (parse_digit(p[0]) << 4 | parse_digit(p[1])); +} + +static void +send_pending_data(int fd) +{ + ssize_t nwritten; + + if (cur_resp.len == 0) { + mevent_disable(write_event); + return; + } + nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); + if (nwritten == -1) { + warn("Write to GDB socket failed"); + close_connection(); + } else { + io_buffer_advance(&cur_resp, nwritten); + if (cur_resp.len == 0) + mevent_disable(write_event); + else + mevent_enable(write_event); + } +} + +/* Append a single character to the output buffer. */ +static void +send_char(uint8_t data) +{ + io_buffer_grow(&cur_resp, 1); + *io_buffer_tail(&cur_resp) = data; + cur_resp.len++; +} + +/* Append an array of bytes to the output buffer. */ +static void +send_data(const uint8_t *data, size_t len) +{ + + io_buffer_grow(&cur_resp, len); + memcpy(io_buffer_tail(&cur_resp), data, len); + cur_resp.len += len; +} + +static void +format_byte(uint8_t v, uint8_t *buf) +{ + + buf[0] = hex_digit(v >> 4); + buf[1] = hex_digit(v & 0xf); +} + +/* + * Append a single byte (formatted as two hex characters) to the + * output buffer. + */ +static void +send_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + send_data(buf, sizeof(buf)); +} + +static void +start_packet(void) +{ + + send_char('$'); + cur_csum = 0; +} + +static void +finish_packet(void) +{ + + send_char('#'); + send_byte(cur_csum); + debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); +} + +/* + * Append a single character (for the packet payload) and update the + * checksum. + */ +static void +append_char(uint8_t v) +{ + + send_char(v); + cur_csum += v; +} + +/* + * Append an array of bytes (for the packet payload) and update the + * checksum. + */ +static void +append_packet_data(const uint8_t *data, size_t len) +{ + + send_data(data, len); + while (len > 0) { + cur_csum += *data; + data++; + len--; + } +} + +static void +append_string(const char *str) +{ + +#ifdef __FreeBSD__ + append_packet_data(str, strlen(str)); +#else + append_packet_data((const uint8_t *)str, strlen(str)); +#endif +} + +static void +append_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + append_packet_data(buf, sizeof(buf)); +} + +static void +append_unsigned_native(uintmax_t value, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + append_byte(value); + value >>= 8; + } +} + +static void +append_unsigned_be(uintmax_t value, size_t len) +{ + char buf[len * 2]; + size_t i; + + for (i = 0; i < len; i++) { +#ifdef __FreeBSD__ + format_byte(value, buf + (len - i - 1) * 2); +#else + format_byte(value, (uint8_t *)(buf + (len - i - 1) * 2)); +#endif + value >>= 8; + } +#ifdef __FreeBSD__ + append_packet_data(buf, sizeof(buf)); +#else + append_packet_data((const uint8_t *)buf, sizeof(buf)); +#endif +} + +static void +append_integer(unsigned int value) +{ + + if (value == 0) + append_char('0'); + else + append_unsigned_be(value, fls(value) + 7 / 8); +} + +static void +append_asciihex(const char *str) +{ + + while (*str != '\0') { + append_byte(*str); + str++; + } +} + +static void +send_empty_response(void) +{ + + start_packet(); + finish_packet(); +} + +static void +send_error(int error) +{ + + start_packet(); + append_char('E'); + append_byte(error); + finish_packet(); +} + +static void +send_ok(void) +{ + + start_packet(); + append_string("OK"); + finish_packet(); +} + +static int +parse_threadid(const uint8_t *data, size_t len) +{ + + if (len == 1 && *data == '0') + return (0); + if (len == 2 && memcmp(data, "-1", 2) == 0) + return (-1); + if (len == 0) + return (-2); + return (parse_integer(data, len)); +} + +static void +report_stop(void) +{ + + start_packet(); + if (stopped_vcpu == -1) + append_char('S'); + else + append_char('T'); + append_byte(GDB_SIGNAL_TRAP); + if (stopped_vcpu != -1) { + append_string("thread:"); + append_integer(stopped_vcpu + 1); + append_char(';'); + } + stopped_vcpu = -1; + finish_packet(); +} + +static void +gdb_finish_suspend_vcpus(void) +{ + + if (first_stop) { + first_stop = false; + stopped_vcpu = -1; + } else if (response_pending()) + stop_pending = true; + else { + report_stop(); + send_pending_data(cur_fd); + } +} + +static void +_gdb_cpu_suspend(int vcpu, bool report_stop) +{ + + debug("$vCPU %d suspending\n", vcpu); + CPU_SET(vcpu, &vcpus_waiting); + if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); + while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu) + pthread_cond_wait(&idle_vcpus, &gdb_lock); + CPU_CLR(vcpu, &vcpus_waiting); + debug("$vCPU %d resuming\n", vcpu); +} + +void +gdb_cpu_add(int vcpu) +{ + + debug("$vCPU %d starting\n", vcpu); + pthread_mutex_lock(&gdb_lock); + CPU_SET(vcpu, &vcpus_active); + + /* + * If a vcpu is added while vcpus are stopped, suspend the new + * vcpu so that it will pop back out with a debug exit before + * executing the first instruction. + */ + if (!CPU_EMPTY(&vcpus_suspended)) { + CPU_SET(vcpu, &vcpus_suspended); + _gdb_cpu_suspend(vcpu, false); + } + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&gdb_lock); + _gdb_cpu_suspend(vcpu, true); + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_mtrap(int vcpu) +{ + + debug("$vCPU %d MTRAP\n", vcpu); + pthread_mutex_lock(&gdb_lock); + if (vcpu == stepping_vcpu) { + stepping_vcpu = -1; + vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + vm_suspend_cpu(ctx, vcpu); + assert(stopped_vcpu == -1); + stopped_vcpu = vcpu; + _gdb_cpu_suspend(vcpu, true); + } + pthread_mutex_unlock(&gdb_lock); +} + +static void +gdb_suspend_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + debug("suspending all CPUs\n"); + vcpus_suspended = vcpus_active; + vm_suspend_cpu(ctx, -1); + if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); +} + +static bool +gdb_step_vcpu(int vcpu) +{ + int error, val; + + debug("$vCPU %d step\n", vcpu); + error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); + if (error < 0) + return (false); + error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + vm_resume_cpu(ctx, vcpu); + stepping_vcpu = vcpu; + pthread_cond_broadcast(&idle_vcpus); + return (true); +} + +static void +gdb_resume_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + vm_resume_cpu(ctx, -1); + debug("resuming all CPUs\n"); + CPU_ZERO(&vcpus_suspended); + pthread_cond_broadcast(&idle_vcpus); +} + +static void +gdb_read_regs(void) +{ + uint64_t regvals[nitems(gdb_regset)]; + int i; + + if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), + gdb_regset, regvals) == -1) { + send_error(errno); + return; + } + start_packet(); + for (i = 0; i < nitems(regvals); i++) + append_unsigned_native(regvals[i], gdb_regsize[i]); + finish_packet(); +} + +static void +gdb_read_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + bool started; + int error; + + cp = memchr(data, ',', len); + if (cp == NULL) { + send_error(EINVAL); + return; + } + gva = parse_integer(data + 1, cp - (data + 1)); + resid = parse_integer(cp + 1, len - (cp + 1 - data)); + started = false; + + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + if (started) + finish_packet(); + else + send_error(errno); + return; + } + if (error == 0) { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + + /* Read bytes from current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, read it a byte + * at a time. + */ + if (!started) { + start_packet(); + started = true; + } + while (todo > 0) { + append_byte(*cp); + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned reads of words when possible. + */ + while (todo > 0) { + if (gpa & 1 || todo == 1) + bytes = 1; + else if (gpa & 2 || todo == 2) + bytes = 2; + else + bytes = 4; + error = read_mem(ctx, cur_vcpu, gpa, &val, + bytes); + if (error == 0) { + if (!started) { + start_packet(); + started = true; + } + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + while (bytes > 0) { + append_byte(val); + val >>= 8; + bytes--; + } + } else { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + if (!started) + start_packet(); + finish_packet(); +} + +static bool +command_equals(const uint8_t *data, size_t len, const char *cmd) +{ + + if (strlen(cmd) > len) + return (false); + return (memcmp(data, cmd, strlen(cmd)) == 0); +} + +static void +gdb_query(const uint8_t *data, size_t len) +{ + + /* + * TODO: + * - qSearch + * - qSupported + */ + if (command_equals(data, len, "qAttached")) { + start_packet(); + append_char('1'); + finish_packet(); + } else if (command_equals(data, len, "qC")) { + start_packet(); + append_string("QC"); + append_integer(cur_vcpu + 1); + finish_packet(); + } else if (command_equals(data, len, "qfThreadInfo")) { + cpuset_t mask; + bool first; + int vcpu; + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + return; + } + mask = vcpus_active; + start_packet(); + append_char('m'); + first = true; + while (!CPU_EMPTY(&mask)) { + vcpu = CPU_FFS(&mask) - 1; + CPU_CLR(vcpu, &mask); + if (first) + first = false; + else + append_char(','); + append_integer(vcpu + 1); + } + finish_packet(); + } else if (command_equals(data, len, "qsThreadInfo")) { + start_packet(); + append_char('l'); + finish_packet(); + } else if (command_equals(data, len, "qThreadExtraInfo")) { + char buf[16]; + int tid; + + data += strlen("qThreadExtraInfo"); + len -= strlen("qThreadExtraInfo"); + if (*data != ',') { + send_error(EINVAL); + return; + } + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + + snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); + start_packet(); + append_asciihex(buf); + finish_packet(); + } else + send_empty_response(); +} + +static void +handle_command(const uint8_t *data, size_t len) +{ + + /* Reject packets with a sequence-id. */ + if (len >= 3 && data[0] >= '0' && data[0] <= '9' && + data[0] >= '0' && data[0] <= '9' && data[2] == ':') { + send_empty_response(); + return; + } + + switch (*data) { + case 'c': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + gdb_resume_vcpus(); + break; + case 'D': + send_ok(); + + /* TODO: Resume any stopped CPUs. */ + break; + case 'g': { + gdb_read_regs(); + break; + } + case 'H': { + int tid; + + if (data[1] != 'g' && data[1] != 'c') { + send_error(EINVAL); + break; + } + tid = parse_threadid(data + 2, len - 2); + if (tid == -2) { + send_error(EINVAL); + break; + } + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + break; + } + if (tid == -1 || tid == 0) + cur_vcpu = CPU_FFS(&vcpus_active) - 1; + else if (CPU_ISSET(tid - 1, &vcpus_active)) + cur_vcpu = tid - 1; + else { + send_error(EINVAL); + break; + } + send_ok(); + break; + } + case 'm': + gdb_read_mem(data, len); + break; + case 'T': { + int tid; + + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + send_ok(); + break; + } + case 'q': + gdb_query(data, len); + break; + case 's': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + if (!gdb_step_vcpu(cur_vcpu)) { + send_error(EOPNOTSUPP); + break; + } + break; + case '?': + /* XXX: Only if stopped? */ + /* For now, just report that we are always stopped. */ + start_packet(); + append_char('S'); + append_byte(GDB_SIGNAL_TRAP); + finish_packet(); + break; + case 'G': /* TODO */ + case 'M': /* TODO */ + case 'v': + /* Handle 'vCont' */ + /* 'vCtrlC' */ + case 'p': /* TODO */ + case 'P': /* TODO */ + case 'Q': /* TODO */ + case 't': /* TODO */ + case 'X': /* TODO */ + case 'z': /* TODO */ + case 'Z': /* TODO */ + default: + send_empty_response(); + } +} + +/* Check for a valid packet in the command buffer. */ +static void +check_command(int fd) +{ + uint8_t *head, *hash, *p, sum; + size_t avail, plen; + + for (;;) { + avail = cur_comm.len; + if (avail == 0) + return; + head = io_buffer_head(&cur_comm); + switch (*head) { + case 0x03: + debug("<- Ctrl-C\n"); + io_buffer_consume(&cur_comm, 1); + + gdb_suspend_vcpus(); + break; + case '+': + /* ACK of previous response. */ + debug("<- +\n"); + if (response_pending()) + io_buffer_reset(&cur_resp); + io_buffer_consume(&cur_comm, 1); + if (stop_pending) { + stop_pending = false; + report_stop(); + send_pending_data(fd); + } + break; + case '-': + /* NACK of previous response. */ + debug("<- -\n"); + if (response_pending()) { + cur_resp.len += cur_resp.start; + cur_resp.start = 0; + if (cur_resp.data[0] == '+') + io_buffer_advance(&cur_resp, 1); + debug("-> %.*s\n", (int)cur_resp.len, + io_buffer_head(&cur_resp)); + } + io_buffer_consume(&cur_comm, 1); + send_pending_data(fd); + break; + case '$': + /* Packet. */ + + if (response_pending()) { + warnx("New GDB command while response in " + "progress"); + io_buffer_reset(&cur_resp); + } + + /* Is packet complete? */ + hash = memchr(head, '#', avail); + if (hash == NULL) + return; + plen = (hash - head + 1) + 2; + if (avail < plen) + return; + debug("<- %.*s\n", (int)plen, head); + + /* Verify checksum. */ + for (sum = 0, p = head + 1; p < hash; p++) + sum += *p; + if (sum != parse_byte(hash + 1)) { + io_buffer_consume(&cur_comm, plen); + debug("-> -\n"); + send_char('-'); + send_pending_data(fd); + break; + } + send_char('+'); + + handle_command(head + 1, hash - (head + 1)); + io_buffer_consume(&cur_comm, plen); + if (!response_pending()) { + debug("-> +\n"); + } + send_pending_data(fd); + break; + default: + /* XXX: Possibly drop connection instead. */ + debug("-> %02x\n", *head); + io_buffer_consume(&cur_comm, 1); + break; + } + } +} + +static void +gdb_readable(int fd, enum ev_type event, void *arg) +{ + ssize_t nread; + int pending; + + if (ioctl(fd, FIONREAD, &pending) == -1) { + warn("FIONREAD on GDB socket"); + return; + } + + /* + * 'pending' might be zero due to EOF. We need to call read + * with a non-zero length to detect EOF. + */ + if (pending == 0) + pending = 1; + + /* Ensure there is room in the command buffer. */ + io_buffer_grow(&cur_comm, pending); + assert(io_buffer_avail(&cur_comm) >= pending); + + nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); + if (nread == 0) { + close_connection(); + } else if (nread == -1) { + if (errno == EAGAIN) + return; + + warn("Read from GDB socket"); + close_connection(); + } else { + cur_comm.len += nread; + pthread_mutex_lock(&gdb_lock); + check_command(fd); + pthread_mutex_unlock(&gdb_lock); + } +} + +static void +gdb_writable(int fd, enum ev_type event, void *arg) +{ + + send_pending_data(fd); +} + +static void +new_connection(int fd, enum ev_type event, void *arg) +{ + int optval, s; + + s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); + if (s == -1) { + if (arg != NULL) + err(1, "Failed accepting initial GDB connection"); + + /* Silently ignore errors post-startup. */ + return; + } + + optval = 1; + if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == + -1) { + warn("Failed to disable SIGPIPE for GDB connection"); + close(s); + return; + } + + pthread_mutex_lock(&gdb_lock); + if (cur_fd != -1) { + close(s); + warnx("Ignoring additional GDB connection."); + } + + read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); + if (read_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + pthread_mutex_unlock(&gdb_lock); + return; + } + write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); + if (write_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + mevent_delete_close(read_event); + read_event = NULL; + } + + cur_fd = s; + cur_vcpu = 0; + stepping_vcpu = -1; + stopped_vcpu = -1; + stop_pending = false; + + /* Break on attach. */ + first_stop = true; + gdb_suspend_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +#ifndef WITHOUT_CAPSICUM +void +limit_gdb_socket(int s) +{ + cap_rights_t rights; + unsigned long ioctls[] = { FIONREAD }; + + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, + CAP_SETSOCKOPT, CAP_IOCTL); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} +#endif + +void +init_gdb(struct vmctx *_ctx, int sport, bool wait) +{ + struct sockaddr_in sin; + int error, flags, s; + + debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not "); + + error = pthread_mutex_init(&gdb_lock, NULL); + if (error != 0) + errc(1, error, "gdb mutex init"); + error = pthread_cond_init(&idle_vcpus, NULL); + if (error != 0) + errc(1, error, "gdb cv init"); + + ctx = _ctx; + s = socket(PF_INET, SOCK_STREAM, 0); + if (s < 0) + err(1, "gdb socket create"); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) + err(1, "gdb socket bind"); + + if (listen(s, 1) < 0) + err(1, "gdb socket listen"); + + if (wait) { + /* + * Set vcpu 0 in vcpus_suspended. This will trigger the + * logic in gdb_cpu_add() to suspend the first vcpu before + * it starts execution. The vcpu will remain suspended + * until a debugger connects. + */ + stepping_vcpu = -1; + stopped_vcpu = -1; + CPU_SET(0, &vcpus_suspended); + } + + flags = fcntl(s, F_GETFL); + if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) + err(1, "Failed to mark gdb socket non-blocking"); + +#ifndef WITHOUT_CAPSICUM + limit_gdb_socket(s); +#endif + mevent_add(s, EVF_READ, new_connection, NULL); +} diff --git a/usr/src/cmd/bhyve/gdb.h b/usr/src/cmd/bhyve/gdb.h new file mode 100644 index 0000000000..fa2184df16 --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __GDB_H__ +#define __GDB_H__ + +void gdb_cpu_add(int vcpu); +void gdb_cpu_mtrap(int vcpu); +void gdb_cpu_suspend(int vcpu); +void init_gdb(struct vmctx *ctx, int sport, bool wait); + +#endif /* !__GDB_H__ */ diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c new file mode 100644 index 0000000000..b460ee2988 --- /dev/null +++ b/usr/src/cmd/bhyve/inout.c @@ -0,0 +1,299 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "bhyverun.h" +#include "inout.h" + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +#define VERIFY_IOPORT(port, size) \ + assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) + +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; + +static int +default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); +} + +static void +register_default_iohandler(int start, int size) +{ + struct inout_port iop; + + VERIFY_IOPORT(start, size); + + bzero(&iop, sizeof(iop)); + iop.name = "default"; + iop.port = start; + iop.size = size; + iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; + iop.handler = default_inout; + + register_inout(&iop); +} + +int +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) +{ + int addrsize, bytes, flags, in, port, prot, rep; + uint32_t eax, val; + inout_func_t handler; + void *arg; + int error, fault, retval; + enum vm_reg_name idxreg; + uint64_t gla, index, iterations, count; + struct vm_inout_str *vis; + struct iovec iov[2]; + + bytes = vmexit->u.inout.bytes; + in = vmexit->u.inout.in; + port = vmexit->u.inout.port; + + assert(port < MAX_IOPORTS); + assert(bytes == 1 || bytes == 2 || bytes == 4); + + handler = inout_handlers[port].handler; + + if (strict && handler == default_inout) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if (in) { + if (!(flags & IOPORT_F_IN)) + return (-1); + } else { + if (!(flags & IOPORT_F_OUT)) + return (-1); + } + + retval = 0; + if (vmexit->u.inout.string) { + vis = &vmexit->u.inout_str; + rep = vis->inout.rep; + addrsize = vis->addrsize; + prot = in ? PROT_WRITE : PROT_READ; + assert(addrsize == 2 || addrsize == 4 || addrsize == 8); + + /* Index register */ + idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + index = vis->index & vie_size2mask(addrsize); + + /* Count register */ + count = vis->count & vie_size2mask(addrsize); + + /* Limit number of back-to-back in/out emulations to 16 */ + iterations = MIN(count, 16); + while (iterations > 0) { + assert(retval == 0); + if (vie_calculate_gla(vis->paging.cpu_mode, + vis->seg_name, &vis->seg_desc, index, bytes, + addrsize, prot, &gla)) { + vm_inject_gp(ctx, vcpu); + break; + } + + error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, + bytes, prot, iov, nitems(iov), &fault); + if (error) { + retval = -1; /* Unrecoverable error */ + break; + } else if (fault) { + retval = 0; /* Resume guest to handle fault */ + break; + } + + if (vie_alignment_check(vis->paging.cpl, bytes, + vis->cr0, vis->rflags, gla)) { + vm_inject_ac(ctx, vcpu, 0); + break; + } + + val = 0; + if (!in) + vm_copyin(ctx, vcpu, iov, &val, bytes); + + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval != 0) + break; + + if (in) + vm_copyout(ctx, vcpu, &val, iov, bytes); + + /* Update index */ + if (vis->rflags & PSL_D) + index -= bytes; + else + index += bytes; + + count--; + iterations--; + } + + /* Update index register */ + error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); + assert(error == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if (rep) { + error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, + count, addrsize); + assert(error == 0); + } + + /* Restart the instruction if more iterations remain */ + if (retval == 0 && count != 0) { + error = vm_restart_instruction(ctx, vcpu); + assert(error == 0); + } + } else { + eax = vmexit->u.inout.eax; + val = eax & vie_size2mask(bytes); + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval == 0 && in) { + eax &= ~vie_size2mask(bytes); + eax |= val & vie_size2mask(bytes); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + eax); + assert(error == 0); + } + } + return (retval); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + /* + * Set up the default handler for all ports + */ + register_default_iohandler(0, MAX_IOPORTS); + + /* + * Overwrite with specified handlers + */ + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + int i; + + VERIFY_IOPORT(iop->port, iop->size); + + /* + * Verify that the new registration is not overwriting an already + * allocated i/o range. + */ + if ((iop->flags & IOPORT_F_DEFAULT) == 0) { + for (i = iop->port; i < iop->port + iop->size; i++) { + if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) + return (-1); + } + } + + for (i = iop->port; i < iop->port + iop->size; i++) { + inout_handlers[i].name = iop->name; + inout_handlers[i].flags = iop->flags; + inout_handlers[i].handler = iop->handler; + inout_handlers[i].arg = iop->arg; + } + + return (0); +} + +int +unregister_inout(struct inout_port *iop) +{ + + VERIFY_IOPORT(iop->port, iop->size); + assert(inout_handlers[iop->port].name == iop->name); + + register_default_iohandler(iop->port, iop->size); + + return (0); +} diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h new file mode 100644 index 0000000000..b72ee5d93e --- /dev/null +++ b/usr/src/cmd/bhyve/inout.h @@ -0,0 +1,93 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _INOUT_H_ +#define _INOUT_H_ + +#include <sys/linker_set.h> + +struct vmctx; +struct vm_exit; + +/* + * inout emulation handlers return 0 on success and -1 on failure. + */ +typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +struct inout_port { + const char *name; + int port; + int size; + int flags; + inout_func_t handler; + void *arg; +}; +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) + +/* + * The following flags are used internally and must not be used by + * device models. + */ +#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ + #name, \ + (port), \ + 1, \ + (flags), \ + (handler), \ + 0 \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) + +void init_inout(void); +int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, + int strict); +int register_inout(struct inout_port *iop); +int unregister_inout(struct inout_port *iop); +void init_bvmcons(void); + +#endif /* _INOUT_H_ */ diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c new file mode 100644 index 0000000000..acdbb5111b --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.c @@ -0,0 +1,83 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <stdio.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "ioapic.h" +#include "pci_emul.h" +#include "pci_lpc.h" + +/* + * Assign PCI INTx interrupts to I/O APIC pins in a round-robin + * fashion. Note that we have no idea what the HPET is using, but the + * HPET is also programmable whereas this is intended for hardwired + * PCI interrupts. + * + * This assumes a single I/O APIC where pins >= 16 are permitted for + * PCI devices. + */ +static int pci_pins; + +void +ioapic_init(struct vmctx *ctx) +{ + + if (vm_ioapic_pincount(ctx, &pci_pins) < 0) { + pci_pins = 0; + return; + } + + /* Ignore the first 16 pins. */ + if (pci_pins <= 16) { + pci_pins = 0; + return; + } + pci_pins -= 16; +} + +int +ioapic_pci_alloc_irq(struct pci_devinst *pi) +{ + static int last_pin; + + if (pci_pins == 0) + return (-1); + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + return (16 + (4 + pi->pi_slot + pi->pi_lintr.pin) % 8); + } + return (16 + (last_pin++ % pci_pins)); +} diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h new file mode 100644 index 0000000000..3a7fa76192 --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOAPIC_H_ +#define _IOAPIC_H_ + +struct pci_devinst; + +/* + * Allocate a PCI IRQ from the I/O APIC. + */ +void ioapic_init(struct vmctx *ctx); +int ioapic_pci_alloc_irq(struct pci_devinst *pi); + +#endif diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c new file mode 100644 index 0000000000..54ea22aa94 --- /dev/null +++ b/usr/src/cmd/bhyve/iov.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/uio.h> + +#include <stdlib.h> +#include <string.h> +#include "iov.h" + +void +seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + int i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(const struct iovec *iov, int niov) +{ + size_t total = 0; + int i; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +void +truncate_iov(struct iovec *iov, int *niov, size_t length) +{ + size_t done = 0; + int i; + + for (i = 0; i < *niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek <= iov[i].iov_len) { + iov[i].iov_len = toseek; + *niov = i + 1; + return; + } + } +} + +ssize_t +iov_to_buf(const struct iovec *iov, int niov, void **buf) +{ + size_t ptr, total; + int i; + + total = count_iov(iov, niov); + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + for (i = 0, ptr = 0; i < niov; i++) { + memcpy(*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek) +{ + struct iovec *diov; + int ndiov, i; + size_t off = 0, len; + + if (seek > 0) { + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + } else { + diov = iov; + ndiov = niov; + } + + for (i = 0; i < ndiov && off < buflen; i++) { + len = MIN(diov[i].iov_len, buflen - off); + memcpy(diov[i].iov_base, buf + off, len); + off += len; + } + + if (seek > 0) + free(diov); + + return ((ssize_t)off); +} + diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h new file mode 100644 index 0000000000..e3b5916edb --- /dev/null +++ b/usr/src/cmd/bhyve/iov.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, + int *niov2, size_t seek); +void truncate_iov(struct iovec *iov, int *niov, size_t length); +size_t count_iov(const struct iovec *iov, int niov); +ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf); +ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek); + +#endif /* _IOV_H_ */ diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c new file mode 100644 index 0000000000..85e56af10b --- /dev/null +++ b/usr/src/cmd/bhyve/mem.c @@ -0,0 +1,361 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/tree.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <assert.h> +#include <err.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. + */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + int perror; + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa, + struct mem_range *mr, void *arg); + +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + +static int +access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, + void *arg) +{ + struct mmio_rb_range *entry; + int err, perror, immutable; + + pthread_rwlock_rdlock(&mmio_rwlock); + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; + } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + return (ESRCH); + } + } + + assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. + */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } + + err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); + + if (!immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } + + + return (err); +} + +struct emulate_mem_args { + struct vie *vie; + struct vm_guest_paging *paging; +}; + +static int +emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct emulate_mem_args *ema; + + ema = arg; + return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, + mem_read, mem_write, mr)); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct emulate_mem_args ema; + + ema.vie = vie; + ema.paging = paging; + return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); +} + +struct read_mem_args { + uint64_t *rval; + int size; +}; + +static int +read_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct read_mem_args *rma; + + rma = arg; + return (mr->handler(ctx, vcpu, MEM_F_READ, paddr, rma->size, + rma->rval, mr->arg1, mr->arg2)); +} + +int +read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size) +{ + struct read_mem_args rma; + + rma.rval = rval; + rma.size = size; + return (access_memory(ctx, vcpu, gpa, read_mem_cb, &rma)); +} + +static int +register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) +{ + struct mmio_rb_range *entry, *mrp; + int err, perror; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + if (mrp == NULL) { + warn("%s: couldn't allocate memory for mrp\n", + __func__); + err = ENOMEM; + } else { + mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + pthread_rwlock_wrlock(&mmio_rwlock); + if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) + err = mmio_rb_add(rbt, mrp); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + if (err) + free(mrp); + } + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_root, memp)); +} + +int +register_mem_fallback(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_fallback, memp)); +} + +int +unregister_mem(struct mem_range *memp) +{ + struct mem_range *mr; + struct mmio_rb_range *entry = NULL; + int err, perror, i; + + pthread_rwlock_wrlock(&mmio_rwlock); + err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); + if (err == 0) { + mr = &entry->mr_param; + assert(mr->name == memp->name); + assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); + RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); + + /* flush Per-vCPU cache */ + for (i=0; i < VM_MAXCPU; i++) { + if (mmio_hint[i] == entry) + mmio_hint[i] = NULL; + } + } + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h new file mode 100644 index 0000000000..596c0b0cf3 --- /dev/null +++ b/usr/src/cmd/bhyve/mem.h @@ -0,0 +1,65 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include <sys/linker_set.h> + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); + +int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, + int size); +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c new file mode 100644 index 0000000000..a258fd3047 --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.c @@ -0,0 +1,680 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#ifdef __FreeBSD__ +#include <sys/event.h> +#else +#include <port.h> +#include <sys/poll.h> +#include <sys/siginfo.h> +#include <sys/queue.h> +#endif +#include <sys/time.h> + +#include <pthread.h> +#include <pthread_np.h> + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; +#ifdef __FreeBSD__ + int me_timid; +#else + timer_t me_timid; +#endif + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; +#ifndef __FreeBSD__ + port_notify_t me_notify; + struct sigevent me_sigev; + boolean_t me_auto_requeue; +#endif + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} +#ifdef __FreeBSD__ +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ADD: + ret = EV_ADD; /* implicitly enabled */ + break; + case MEV_ENABLE: + ret = EV_ENABLE; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + default: + assert(0); + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? */ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + if (mevp->me_type == EVF_TIMER) { + kev[i].ident = mevp->me_timid; + kev[i].data = mevp->me_msecs; + } else { + kev[i].ident = mevp->me_fd; + kev[i].data = 0; + } + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +#else /* __FreeBSD__ */ + +static void +mevent_update_one(struct mevent *mevp) +{ + int portfd = mevp->me_notify.portnfy_port; + + switch (mevp->me_type) { + case EVF_READ: + case EVF_WRITE: + mevp->me_auto_requeue = B_FALSE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + int events; + + events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; + + if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, + events, mevp) != 0) { + (void) fprintf(stderr, + "port_associate fd %d %p failed: %s\n", + mevp->me_fd, mevp, strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + /* + * A disable that comes in while an event is being + * handled will result in an ENOENT. + */ + if (port_dissociate(portfd, PORT_SOURCE_FD, + mevp->me_fd) != 0 && errno != ENOENT) { + (void) fprintf(stderr, "port_dissociate " + "portfd %d fd %d mevp %p failed: %s\n", + portfd, mevp->me_fd, mevp, strerror(errno)); + } + return; + default: + goto abort; + } + + case EVF_TIMER: + mevp->me_auto_requeue = B_TRUE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + struct itimerspec it = { 0 }; + + mevp->me_sigev.sigev_notify = SIGEV_PORT; + mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; + + if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, + &mevp->me_timid) != 0) { + (void) fprintf(stderr, + "timer_create failed: %s", strerror(errno)); + return; + } + + /* The first timeout */ + it.it_value.tv_sec = mevp->me_msecs / MILLISEC; + it.it_value.tv_nsec = + MSEC2NSEC(mevp->me_msecs % MILLISEC); + /* Repeat at the same interval */ + it.it_interval = it.it_value; + + if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { + (void) fprintf(stderr, "timer_settime failed: " + "%s", strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + if (timer_delete(mevp->me_timid) != 0) { + (void) fprintf(stderr, "timer_delete failed: " + "%s", strerror(errno)); + } + return; + default: + goto abort; + } + default: + /* EVF_SIGNAL not yet implemented. */ + goto abort; + } + +abort: + (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, + mevp->me_type, mevp->me_state); + abort(); +} + +static void +mevent_update_pending(int portfd) +{ + struct mevent *mevp, *tmpp; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + mevp->me_notify.portnfy_port = portfd; + mevp->me_notify.portnfy_user = mevp; + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + (void) close(mevp->me_fd); + mevp->me_fd = -1; + } else { + mevent_update_one(mevp); + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + } + + mevent_qunlock(); +} + +static void +mevent_handle_pe(port_event_t *pe) +{ + struct mevent *mevp = pe->portev_user; + + mevent_qunlock(); + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + + mevent_qlock(); + if (!mevp->me_cq && !mevp->me_auto_requeue) { + mevent_update_one(mevp); + } + mevent_qunlock(); +} +#endif + +struct mevent * +mevent_add(int tfd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ADD; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ + + pthread_set_name_np(mevent_tid, "mevent"); +} + +void +mevent_dispatch(void) +{ +#ifdef __FreeBSD__ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; +#else + struct mevent *pipev; + int portfd; +#endif + int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mevent_tid = pthread_self(); + mevent_set_name(); + +#ifdef __FreeBSD__ + mfd = kqueue(); + assert(mfd > 0); +#else + portfd = port_create(); + assert(portfd >= 0); +#endif + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (caph_rights_limit(mfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { +#ifdef __FreeBSD__ + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + +#else /* __FreeBSD__ */ + port_event_t pev; + + /* Handle any pending updates */ + mevent_update_pending(portfd); + + /* Block awaiting events */ + ret = port_get(portfd, &pev, NULL); + if (ret != 0 && errno != EINTR) { + perror("Error return from port_get"); + continue; + } + + /* Handle reported event */ + mevent_handle_pe(&pev); +#endif /* __FreeBSD__ */ + } +} diff --git a/usr/src/cmd/bhyve/mevent.h b/usr/src/cmd/bhyve/mevent.h new file mode 100644 index 0000000000..e6b96f0a7c --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c new file mode 100644 index 0000000000..4da3adb5ae --- /dev/null +++ b/usr/src/cmd/bhyve/mevent_test.c @@ -0,0 +1,282 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. + * + * cc mevent_test.c mevent.c -lpthread + */ + +#include <sys/types.h> +#include <sys/stdint.h> +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#endif +#include <sys/socket.h> +#include <netinet/in.h> +#ifdef __FreeBSD__ +#include <machine/cpufunc.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <unistd.h> + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum; +#ifdef __FreeBSD__ + uint64_t tsc_freq; + size_t len; +#endif + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + +#ifdef __FreeBSD__ + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); +#endif + + for (j = 1; j < TEVSZ; j++) { +#ifdef __FreeBSD__ + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; +#else + diff = (tevbuf[j] - tevbuf[j-1]) / 1000; +#endif + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + +#ifdef __FreeBSD__ + tevbuf[i++] = rdtsc(); +#else + tevbuf[i++] = gethrtime(); +#endif + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(4); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(s, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +int +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c new file mode 100644 index 0000000000..e78f88f074 --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.c @@ -0,0 +1,379 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <x86/mptable.h> + +#include <stdio.h> +#include <string.h> + +#include "acpi.h" +#include "bhyverun.h" +#include "mptbl.h" +#include "pci_emul.h" + +#define MPTABLE_BASE 0xE0000 + +/* floating pointer length + maximum length of configuration table */ +#define MPTABLE_MAX_LENGTH (65536 + 16) + +#define LAPIC_PADDR 0xFEE00000 +#define LAPIC_VERSION 16 + +#define IOAPIC_PADDR 0xFEC00000 +#define IOAPIC_VERSION 0x11 + +#define MP_SPECREV 4 +#define MPFP_SIG "_MP_" + +/* Configuration header defines */ +#define MPCH_SIG "PCMP" +#define MPCH_OEMID "BHyVe " +#define MPCH_OEMID_LEN 8 +#define MPCH_PRODID "Hypervisor " +#define MPCH_PRODID_LEN 12 + +/* Processor entry defines */ +#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */ +#define MPEP_SIG_MODEL 26 +#define MPEP_SIG_STEPPING 5 +#define MPEP_SIG \ + ((MPEP_SIG_FAMILY << 8) | \ + (MPEP_SIG_MODEL << 4) | \ + (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */ + +/* Number of local intr entries */ +#define MPEII_NUM_LOCAL_IRQ 2 + +/* Bus entry defines */ +#define MPE_NUM_BUSES 2 +#define MPE_BUSNAME_LEN 6 +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " + +static void *oem_tbl_start; +static int oem_tbl_size; + +static uint8_t +mpt_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes; + uint8_t sum; + + for(bytes = base, sum = 0; len > 0; len--) { + sum += *bytes++; + } + + return (256 - sum); +} + +static void +mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa) +{ + + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIG, 4); + mpfp->pap = gpa + sizeof(*mpfp); + mpfp->length = 1; + mpfp->spec_rev = MP_SPECREV; + mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mpt_build_mpch(mpcth_t mpch) +{ + + memset(mpch, 0, sizeof(*mpch)); + memcpy(mpch->signature, MPCH_SIG, 4); + mpch->spec_rev = MP_SPECREV; + memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->apic_address = LAPIC_PADDR; +} + +static void +mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu) +{ + int i; + + for (i = 0; i < ncpu; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->type = MPCT_ENTRY_PROCESSOR; + mpep->apic_id = i; // XXX + mpep->apic_version = LAPIC_VERSION; + mpep->cpu_flags = PROCENTRY_FLAG_EN; + if (i == 0) + mpep->cpu_flags |= PROCENTRY_FLAG_BP; + mpep->cpu_signature = MPEP_SIG; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } +} + +static void +mpt_build_localint_entries(int_entry_ptr mpie) +{ + + /* Hardcode LINT0 as ExtINT on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_EXTINT; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 0; + mpie++; + + /* Hardcode LINT1 as NMI on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_NMI; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 1; +} + +static void +mpt_build_bus_entries(bus_entry_ptr mpeb) +{ + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 0; + memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN); + mpeb++; + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 1; + memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN); +} + +static void +mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id) +{ + + memset(mpei, 0, sizeof(*mpei)); + mpei->type = MPCT_ENTRY_IOAPIC; + mpei->apic_id = id; + mpei->apic_version = IOAPIC_VERSION; + mpei->apic_flags = IOAPICENTRY_FLAG_EN; + mpei->apic_address = IOAPIC_PADDR; +} + +static int +mpt_count_ioint_entries(void) +{ + int bus, count; + + count = 0; + for (bus = 0; bus <= PCI_BUSMAX; bus++) + count += pci_count_lintr(bus); + + /* + * Always include entries for the first 16 pins along with a entry + * for each active PCI INTx pin. + */ + return (16 + count); +} + +static void +mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + int_entry_ptr *mpiep, mpie; + + mpiep = arg; + mpie = *mpiep; + memset(mpie, 0, sizeof(*mpie)); + + /* + * This is always after another I/O interrupt entry, so cheat + * and fetch the I/O APIC ID from the prior entry. + */ + mpie->type = MPCT_ENTRY_INT; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_id = bus; + mpie->src_bus_irq = slot << 2 | (pin - 1); + mpie->dst_apic_id = mpie[-1].dst_apic_id; + mpie->dst_apic_int = ioapic_irq; + + *mpiep = mpie + 1; +} + +static void +mpt_build_ioint_entries(int_entry_ptr mpie, int id) +{ + int pin, bus; + + /* + * The following config is taken from kernel mptable.c + * mptable_parse_default_config_ints(...), for now + * just use the default config, tweek later if needed. + */ + + /* First, generate the first 16 pins. */ + for (pin = 0; pin < 16; pin++) { + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_INT; + mpie->src_bus_id = 1; + mpie->dst_apic_id = id; + + /* + * All default configs route IRQs from bus 0 to the first 16 + * pins of the first I/O APIC with an APIC ID of 2. + */ + mpie->dst_apic_int = pin; + switch (pin) { + case 0: + /* Pin 0 is an ExtINT pin. */ + mpie->int_type = INTENTRY_TYPE_EXTINT; + break; + case 2: + /* IRQ 0 is routed to pin 2. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = 0; + break; + case SCI_INT: + /* ACPI SCI is level triggered and active-lo. */ + mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO | + INTENTRY_FLAGS_TRIGGER_LEVEL; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = SCI_INT; + break; + default: + /* All other pins are identity mapped. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = pin; + break; + } + mpie++; + } + + /* Next, generate entries for any PCI INTx interrupts. */ + for (bus = 0; bus <= PCI_BUSMAX; bus++) + pci_walk_lintr(bus, mpt_generate_pci_int, &mpie); +} + +void +mptable_add_oemtbl(void *tbl, int tblsz) +{ + + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +mptable_build(struct vmctx *ctx, int ncpu) +{ + mpcth_t mpch; + bus_entry_ptr mpeb; + io_apic_entry_ptr mpei; + proc_entry_ptr mpep; + mpfps_t mpfp; + int_entry_ptr mpie; + int ioints, bus; + char *curraddr; + char *startaddr; + + startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "mptable requires mapped mem\n"); + return (ENOMEM); + } + + /* + * There is no way to advertise multiple PCI hierarchies via MPtable + * so require that there is no PCI hierarchy with a non-zero bus + * number. + */ + for (bus = 1; bus <= PCI_BUSMAX; bus++) { + if (pci_bus_configured(bus)) { + fprintf(stderr, "MPtable is incompatible with " + "multiple PCI hierarchies.\r\n"); + fprintf(stderr, "MPtable generation can be disabled " + "by passing the -Y option to bhyve(8).\r\n"); + return (EINVAL); + } + } + + curraddr = startaddr; + mpfp = (mpfps_t)curraddr; + mpt_build_mpfp(mpfp, MPTABLE_BASE); + curraddr += sizeof(*mpfp); + + mpch = (mpcth_t)curraddr; + mpt_build_mpch(mpch); + curraddr += sizeof(*mpch); + + mpep = (proc_entry_ptr)curraddr; + mpt_build_proc_entries(mpep, ncpu); + curraddr += sizeof(*mpep) * ncpu; + mpch->entry_count += ncpu; + + mpeb = (bus_entry_ptr) curraddr; + mpt_build_bus_entries(mpeb); + curraddr += sizeof(*mpeb) * MPE_NUM_BUSES; + mpch->entry_count += MPE_NUM_BUSES; + + mpei = (io_apic_entry_ptr)curraddr; + mpt_build_ioapic_entries(mpei, 0); + curraddr += sizeof(*mpei); + mpch->entry_count++; + + mpie = (int_entry_ptr) curraddr; + ioints = mpt_count_ioint_entries(); + mpt_build_ioint_entries(mpie, 0); + curraddr += sizeof(*mpie) * ioints; + mpch->entry_count += ioints; + + mpie = (int_entry_ptr)curraddr; + mpt_build_localint_entries(mpie); + curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ; + mpch->entry_count += MPEII_NUM_LOCAL_IRQ; + + if (oem_tbl_start) { + mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE; + mpch->oem_table_size = oem_tbl_size; + memcpy(curraddr, oem_tbl_start, oem_tbl_size); + } + + mpch->base_table_length = curraddr - (char *)mpch; + mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length); + + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h new file mode 100644 index 0000000000..ebc8d85ea8 --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.h @@ -0,0 +1,37 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MPTBL_H_ +#define _MPTBL_H_ + +int mptable_build(struct vmctx *ctx, int ncpu); +void mptable_add_oemtbl(void *tbl, int tblsz); + +#endif /* _MPTBL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c new file mode 100644 index 0000000000..1e3feffcc2 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -0,0 +1,2485 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org> + * Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <sys/disk.h> +#include <sys/ata.h> +#include <sys/endian.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> +#include <inttypes.h> +#include <md5.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "ahci.h" +#include "block_if.h" + +#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define MAX_PORTS 32 /* AHCI supports 32 ports */ + +#define PxSIG_ATA 0x00000101 /* ATA drive */ +#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ + +enum sata_fis_type { + FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ + FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ + FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ + FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ + FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ + FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ + FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ + FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ +}; + +/* + * SCSI opcodes + */ +#define TEST_UNIT_READY 0x00 +#define REQUEST_SENSE 0x03 +#define INQUIRY 0x12 +#define START_STOP_UNIT 0x1B +#define PREVENT_ALLOW 0x1E +#define READ_CAPACITY 0x25 +#define READ_10 0x28 +#define POSITION_TO_ELEMENT 0x2B +#define READ_TOC 0x43 +#define GET_EVENT_STATUS_NOTIFICATION 0x4A +#define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 +#define READ_12 0xA8 +#define READ_CD 0xBE + +/* + * SCSI mode page codes + */ +#define MODEPAGE_RW_ERROR_RECOVERY 0x01 +#define MODEPAGE_CD_CAPABILITIES 0x2A + +/* + * ATA commands + */ +#define ATA_SF_ENAB_SATA_SF 0x10 +#define ATA_SATA_SF_AN 0x05 +#define ATA_SF_DIS_SATA_SF 0x90 + +/* + * Debug printf + */ +#ifdef AHCI_DEBUG +static FILE *dbg; +#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) +#else +#define DPRINTF(format, arg...) +#endif +#define WPRINTF(format, arg...) printf(format, ##arg) + +#define AHCI_PORT_IDENT 20 + 1 + +struct ahci_ioreq { + struct blockif_req io_req; + struct ahci_port *io_pr; + STAILQ_ENTRY(ahci_ioreq) io_flist; + TAILQ_ENTRY(ahci_ioreq) io_blist; + uint8_t *cfis; + uint32_t len; + uint32_t done; + int slot; + int more; +}; + +struct ahci_port { + struct blockif_ctxt *bctx; + struct pci_ahci_softc *pr_sc; + uint8_t *cmd_lst; + uint8_t *rfis; + char ident[AHCI_PORT_IDENT]; + int port; + int atapi; + int reset; + int waitforclear; + int mult_sectors; + uint8_t xfermode; + uint8_t err_cfis[20]; + uint8_t sense_key; + uint8_t asc; + u_int ccs; + uint32_t pending; + + uint32_t clb; + uint32_t clbu; + uint32_t fb; + uint32_t fbu; + uint32_t is; + uint32_t ie; + uint32_t cmd; + uint32_t unused0; + uint32_t tfd; + uint32_t sig; + uint32_t ssts; + uint32_t sctl; + uint32_t serr; + uint32_t sact; + uint32_t ci; + uint32_t sntf; + uint32_t fbs; + + /* + * i/o request info + */ + struct ahci_ioreq *ioreq; + int ioqsz; + STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; + TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; +}; + +struct ahci_cmd_hdr { + uint16_t flags; + uint16_t prdtl; + uint32_t prdbc; + uint64_t ctba; + uint32_t reserved[4]; +}; + +struct ahci_prdt_entry { + uint64_t dba; + uint32_t reserved; +#define DBCMASK 0x3fffff + uint32_t dbc; +}; + +struct pci_ahci_softc { + struct pci_devinst *asc_pi; + pthread_mutex_t mtx; + int ports; + uint32_t cap; + uint32_t ghc; + uint32_t is; + uint32_t pi; + uint32_t vs; + uint32_t ccc_ctl; + uint32_t ccc_pts; + uint32_t em_loc; + uint32_t em_ctl; + uint32_t cap2; + uint32_t bohc; + uint32_t lintr; + struct ahci_port port[MAX_PORTS]; +}; +#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) + +static void ahci_handle_port(struct ahci_port *p); + +static inline void lba_to_msf(uint8_t *buf, int lba) +{ + lba += 150; + buf[0] = (lba / 75) / 60; + buf[1] = (lba / 75) % 60; + buf[2] = lba % 75; +} + +/* + * Generate HBA interrupts on global IS register write. + */ +static void +ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) +{ + struct pci_devinst *pi = sc->asc_pi; + struct ahci_port *p; + int i, nmsg; + uint32_t mmask; + + /* Update global IS from PxIS/PxIE. */ + for (i = 0; i < sc->ports; i++) { + p = &sc->port[i]; + if (p->is & p->ie) + sc->is |= (1 << i); + } + DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is); + + /* If there is nothing enabled -- clear legacy interrupt and exit. */ + if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { + if (sc->lintr) { + pci_lintr_deassert(pi); + sc->lintr = 0; + } + return; + } + + /* If there is anything and no MSI -- assert legacy interrupt. */ + nmsg = pci_msi_maxmsgnum(pi); + if (nmsg == 0) { + if (!sc->lintr) { + sc->lintr = 1; + pci_lintr_assert(pi); + } + return; + } + + /* Assert respective MSIs for ports that were touched. */ + for (i = 0; i < nmsg; i++) { + if (sc->ports <= nmsg || i < nmsg - 1) + mmask = 1 << i; + else + mmask = 0xffffffff << i; + if (sc->is & mask && mmask & mask) + pci_generate_msi(pi, i); + } +} + +/* + * Generate HBA interrupt on specific port event. + */ +static void +ahci_port_intr(struct ahci_port *p) +{ + struct pci_ahci_softc *sc = p->pr_sc; + struct pci_devinst *pi = sc->asc_pi; + int nmsg; + + DPRINTF("%s(%d) %08x/%08x %08x\n", __func__, + p->port, p->is, p->ie, sc->is); + + /* If there is nothing enabled -- we are done. */ + if ((p->is & p->ie) == 0) + return; + + /* In case of non-shared MSI always generate interrupt. */ + nmsg = pci_msi_maxmsgnum(pi); + if (sc->ports <= nmsg || p->port < nmsg - 1) { + sc->is |= (1 << p->port); + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + pci_generate_msi(pi, p->port); + return; + } + + /* If IS for this port is already set -- do nothing. */ + if (sc->is & (1 << p->port)) + return; + + sc->is |= (1 << p->port); + + /* If interrupts are enabled -- generate one. */ + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + if (nmsg > 0) { + pci_generate_msi(pi, nmsg - 1); + } else if (!sc->lintr) { + sc->lintr = 1; + pci_lintr_assert(pi); + } +} + +static void +ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) +{ + int offset, len, irq; + + if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) + return; + + switch (ft) { + case FIS_TYPE_REGD2H: + offset = 0x40; + len = 20; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; + break; + case FIS_TYPE_SETDEVBITS: + offset = 0x58; + len = 8; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; + break; + case FIS_TYPE_PIOSETUP: + offset = 0x20; + len = 20; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; + break; + default: + WPRINTF("unsupported fis type %d\n", ft); + return; + } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } + memcpy(p->rfis + offset, fis, len); + if (irq) { + if (~p->is & irq) { + p->is |= irq; + ahci_port_intr(p); + } + } +} + +static void +ahci_write_fis_piosetup(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_PIOSETUP; + ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); +} + +static void +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[8]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + tfd &= 0x77; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = slot; + p->err_cfis[2] = tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)(fis + 4) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~0x77; + p->tfd |= tfd; + ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); +} + +static void +ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[20]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = (1 << 6); + fis[2] = tfd & 0xff; + fis[3] = error; + fis[4] = cfis[4]; + fis[5] = cfis[5]; + fis[6] = cfis[6]; + fis[7] = cfis[7]; + fis[8] = cfis[8]; + fis[9] = cfis[9]; + fis[10] = cfis[10]; + fis[11] = cfis[11]; + fis[12] = cfis[12]; + fis[13] = cfis[13]; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else + p->ci &= ~(1 << slot); + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_reset_fis_d2h(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[3] = 1; + fis[4] = 1; + if (p->atapi) { + fis[5] = 0x14; + fis[6] = 0xeb; + } + fis[12] = 1; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_check_stopped(struct ahci_port *p) +{ + /* + * If we are no longer processing the command list and nothing + * is in-flight, clear the running bit, the current command + * slot, the command issue and active bits. + */ + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (p->pending == 0) { + p->ccs = 0; + p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); + p->ci = 0; + p->sact = 0; + p->waitforclear = 0; + } + } +} + +static void +ahci_port_stop(struct ahci_port *p) +{ + struct ahci_ioreq *aior; + uint8_t *cfis; + int slot; + int error; + + assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); + + TAILQ_FOREACH(aior, &p->iobhd, io_blist) { + /* + * Try to cancel the outstanding blockif request. + */ + error = blockif_cancel(p->bctx, &aior->io_req); + if (error != 0) + continue; + + slot = aior->slot; + cfis = aior->cfis; + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + p->sact &= ~(1 << slot); /* NCQ */ + else + p->ci &= ~(1 << slot); + + /* + * This command is now done. + */ + p->pending &= ~(1 << slot); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + } + + ahci_check_stopped(p); +} + +static void +ahci_port_reset(struct ahci_port *pr) +{ + pr->serr = 0; + pr->sact = 0; + pr->xfermode = ATA_UDMA6; + pr->mult_sectors = 128; + + if (!pr->bctx) { + pr->ssts = ATA_SS_DET_NO_DEVICE; + pr->sig = 0xFFFFFFFF; + pr->tfd = 0x7F; + return; + } + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; + pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; + if (!pr->atapi) { + pr->sig = PxSIG_ATA; + pr->tfd |= ATA_S_READY; + } else + pr->sig = PxSIG_ATAPI; + ahci_write_reset_fis_d2h(pr); +} + +static void +ahci_reset(struct pci_ahci_softc *sc) +{ + int i; + + sc->ghc = AHCI_GHC_AE; + sc->is = 0; + + if (sc->lintr) { + pci_lintr_deassert(sc->asc_pi); + sc->lintr = 0; + } + + for (i = 0; i < sc->ports; i++) { + sc->port[i].ie = 0; + sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; + ahci_port_reset(&sc->port[i]); + } +} + +static void +ata_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i ^ 1] = *src++; + else + dest[i ^ 1] = ' '; + } +} + +static void +atapi_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i] = *src++; + else + dest[i] = ' '; + } +} + +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. + */ +static void +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = aior->done; + left = aior->len - aior->done; + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= skip) { + skip -= dbcsz; + continue; + } + dbcsz -= skip; + if (dbcsz > left) + dbcsz = left; + breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), + prdt->dba + skip, dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. */ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > extra) { + breq->br_iov[j - 1].iov_len -= extra; + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += todo; + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + struct ahci_prdt_entry *prdt; + struct ahci_cmd_hdr *hdr; + uint64_t lba; + uint32_t len; + int err, first, ncq, readop; + + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + ncq = 0; + readop = 1; + first = (done == 0); + + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) + readop = 0; + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[11] << 8 | cfis[3]; + if (!len) + len = 65536; + ncq = 1; + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[13] << 8 | cfis[12]; + if (!len) + len = 65536; + } else { + lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | + (cfis[5] << 8) | cfis[4]; + len = cfis[12]; + if (!len) + len = 256; + } + lba *= blockif_sectsz(p->bctx); + len *= blockif_sectsz(p->bctx); + + /* Pull request off free list */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + ahci_build_iov(p, aior, prdt, hdr->prdtl); + + /* Mark this command in-flight. */ + p->pending |= 1 << slot; + + /* Stuff request onto busy list. */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + if (readop) + err = blockif_read(p->bctx, breq); + else + err = blockif_write(p->bctx, breq); + assert(err == 0); +} + +static void +ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + int err; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = 0; + aior->done = 0; + aior->more = 0; + breq = &aior->io_req; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_flush(p->bctx, breq); + assert(err == 0); +} + +static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = MIN(len, dbcsz); + memcpy(to, ptr, sublen); + len -= sublen; + to += sublen; + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint16_t)cfis[13] << 8 | cfis[12]; + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint16_t)cfis[11] << 8 | cfis[3]; + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint16_t)entry[7] << 8 | entry[6]; + done += 8; + if (elen == 0) { + if (done >= len) { + if (ncq) { + if (first) + ahci_write_fis_d2h_ncq(p, slot); + ahci_write_fis_sdb(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } else { + ahci_write_fis_d2h(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != done); + + breq = &aior->io_req; + breq->br_offset = elba * blockif_sectsz(p->bctx); + breq->br_resid = elen * blockif_sectsz(p->bctx); + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + +static inline void +write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *from; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + from = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = MIN(len, dbcsz); + memcpy(ptr, from, sublen); + len -= sublen; + from += sublen; + prdt++; + } + hdr->prdbc = size - len; +} + +static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = 0x100 - sum; +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint32_t buf[128]; + uint8_t *buf8 = (uint8_t *)buf; + uint16_t *buf16 = (uint16_t *)buf; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || + cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + if (cfis[4] == 0x00) { /* Log directory */ + buf16[0x00] = 1; /* Version -- 1 */ + buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ + buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ + } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ + memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf8, sizeof(buf)); + } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ + if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { + buf[0x00] = 1; /* SFQ DSM supported */ + buf[0x01] = 1; /* SFQ DSM TRIM supported */ + } + } else { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + +static void +handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; + uint16_t cyl; + uint8_t sech, heads; + + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = blockif_size(p->bctx) / sectsz; + blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); + memset(buf, 0, sizeof(buf)); + buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); + buf[47] = (0x8000 | 128); + buf[48] = 0; + buf[49] = (1 << 8 | 1 << 9 | 1 << 11); + buf[50] = (1 << 14); + buf[53] = (1 << 1 | 1 << 2); + if (p->mult_sectors) + buf[59] = (0x100 | p->mult_sectors); + if (sectors <= 0x0fffffff) { + buf[60] = sectors; + buf[61] = (sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } + buf[63] = 0x7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 0x3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; + buf[75] = 31; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; + buf[81] = 0x28; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); + buf[84] = (1 << 14); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[100] = sectors; + buf[101] = (sectors >> 16); + buf[102] = (sectors >> 32); + buf[103] = (sectors >> 48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = sectsz / 2; + buf[118] = ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + + memset(buf, 0, sizeof(buf)); + buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); + buf[49] = (1 << 9 | 1 << 8); + buf[50] = (1 << 14 | 1); + buf[53] = (1 << 2 | 1 << 1); + buf[62] = 0x3f; + buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[78] = (1 << 5); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[83] = (1 << 14); + buf[84] = (1 << 14); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[36]; + uint8_t *acmd; + int len; + uint32_t tfd; + + acmd = cfis + 0x40; + + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + + if (len > acmd[4]) + len = acmd[4]; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, len); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[8]; + uint64_t sectors; + + sectors = blockif_size(p->bctx) / 2048; + be32enc(buf, sectors - 1); + be32enc(buf + 4, 2048); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint8_t format; + int len; + + acmd = cfis + 0x40; + + len = be16dec(acmd + 7); + format = acmd[9] >> 6; + switch (format) { + case 0: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, buf[20], *bp; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + if (start_track > 1 && start_track != 0xaa) { + uint32_t tfd; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + if (start_track <= 1) { + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 1; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + } + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 0xaa; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 1: + { + uint8_t buf[12]; + + memset(buf, 0, sizeof(buf)); + buf[1] = 0xa; + buf[2] = 0x1; + buf[3] = 0x1; + if (len > sizeof(buf)) + len = sizeof(buf); + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 2: + { + int msf, size; + uint64_t sectors; + uint8_t *bp, buf[50]; + + msf = (acmd[1] >> 1) & 1; + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa2; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + default: + { + uint32_t tfd; + + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + break; + } + } +} + +static void +atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[16]; + + memset(buf, 0, sizeof(buf)); + buf[3] = 8; + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct blockif_req *breq; + uint8_t *acmd; + uint64_t lba; + uint32_t len; + int err; + + acmd = cfis + 0x40; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + + lba = be32dec(acmd + 2); + if (acmd[0] == READ_10) + len = be16dec(acmd + 7); + else + len = be32dec(acmd + 6); + if (len == 0) { + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + } + lba *= 2048; + len *= 2048; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + ahci_build_iov(p, aior, prdt, hdr->prdtl); + + /* Mark this command in-flight. */ + p->pending |= 1 << slot; + + /* Stuff request onto busy list. */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_read(p->bctx, breq); + assert(err == 0); +} + +static void +atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[64]; + uint8_t *acmd; + int len; + + acmd = cfis + 0x40; + len = acmd[4]; + if (len > sizeof(buf)) + len = sizeof(buf); + memset(buf, 0, len); + buf[0] = 0x70 | (1 << 7); + buf[2] = p->sense_key; + buf[7] = 10; + buf[12] = p->asc; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd = cfis + 0x40; + uint32_t tfd; + + switch (acmd[4] & 3) { + case 0: + case 1: + case 3: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + tfd = ATA_S_READY | ATA_S_DSC; + break; + case 2: + /* TODO eject media */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x53; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd = 0; + uint8_t pc, code; + int len; + + acmd = cfis + 0x40; + len = be16dec(acmd + 7); + pc = acmd[2] >> 6; + code = acmd[2] & 0x3f; + + switch (pc) { + case 0: + switch (code) { + case MODEPAGE_RW_ERROR_RECOVERY: + { + uint8_t buf[16]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 16 - 2); + buf[2] = 0x70; + buf[8] = 0x01; + buf[9] = 16 - 10; + buf[11] = 0x05; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + case MODEPAGE_CD_CAPABILITIES: + { + uint8_t buf[30]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 30 - 2); + buf[2] = 0x70; + buf[8] = 0x2A; + buf[9] = 30 - 10; + buf[10] = 0x08; + buf[12] = 0x71; + be16enc(&buf[18], 2); + be16enc(&buf[20], 512); + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + default: + goto error; + break; + } + break; + case 3: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x39; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; +error: + case 1: + case 2: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_get_event_status_notification(struct ahci_port *p, int slot, + uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + + acmd = cfis + 0x40; + + /* we don't support asynchronous operation */ + if (!(acmd[1] & 1)) { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } else { + uint8_t buf[8]; + int len; + + len = be16dec(acmd + 7); + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 8 - 2); + buf[2] = 0x04; + buf[3] = 0x10; + buf[5] = 0x02; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + + acmd = cfis + 0x40; + +#ifdef AHCI_DEBUG + { + int i; + DPRINTF("ACMD:"); + for (i = 0; i < 16; i++) + DPRINTF("%02x ", acmd[i]); + DPRINTF("\n"); + } +#endif + + switch (acmd[0]) { + case TEST_UNIT_READY: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case INQUIRY: + atapi_inquiry(p, slot, cfis); + break; + case READ_CAPACITY: + atapi_read_capacity(p, slot, cfis); + break; + case PREVENT_ALLOW: + /* TODO */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case READ_TOC: + atapi_read_toc(p, slot, cfis); + break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; + case READ_10: + case READ_12: + atapi_read(p, slot, cfis, 0); + break; + case REQUEST_SENSE: + atapi_request_sense(p, slot, cfis); + break; + case START_STOP_UNIT: + atapi_start_stop_unit(p, slot, cfis); + break; + case MODE_SENSE_10: + atapi_mode_sense(p, slot, cfis); + break; + case GET_EVENT_STATUS_NOTIFICATION: + atapi_get_event_status_notification(p, slot, cfis); + break; + default: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x20; + ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | + ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + + p->tfd |= ATA_S_BUSY; + switch (cfis[2]) { + case ATA_ATA_IDENTIFY: + handle_identify(p, slot, cfis); + break; + case ATA_SETFEATURES: + { + switch (cfis[3]) { + case ATA_SF_ENAB_SATA_SF: + switch (cfis[12]) { + case ATA_SATA_SF_AN: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + break; + case ATA_SF_ENAB_WCACHE: + case ATA_SF_DIS_WCACHE: + case ATA_SF_ENAB_RCACHE: + case ATA_SF_DIS_RCACHE: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + case ATA_SF_SETXFER: + { + switch (cfis[12] & 0xf8) { + case ATA_PIO: + case ATA_PIO0: + break; + case ATA_WDMA0: + case ATA_UDMA0: + p->xfermode = (cfis[12] & 0x7); + break; + } + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + } + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + } + case ATA_SET_MULTI: + if (cfis[12] != 0 && + (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + } else { + p->mult_sectors = cfis[12]; + p->tfd = ATA_S_DSC | ATA_S_READY; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: + case ATA_READ_DMA: + case ATA_WRITE_DMA: + case ATA_READ_DMA48: + case ATA_WRITE_DMA48: + case ATA_READ_FPDMA_QUEUED: + case ATA_WRITE_FPDMA_QUEUED: + ahci_handle_rw(p, slot, cfis, 0); + break; + case ATA_FLUSHCACHE: + case ATA_FLUSHCACHE48: + ahci_handle_flush(p, slot, cfis); + break; + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] == 0 && cfis[3] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); + break; + case ATA_SECURITY_FREEZE_LOCK: + case ATA_SMART_CMD: + case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_CHECK_POWER_MODE: + cfis[12] = 0xff; /* always on */ + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_STANDBY_CMD: + case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_CMD: + case ATA_IDLE_IMMEDIATE: + case ATA_SLEEP: + case ATA_READ_VERIFY: + case ATA_READ_VERIFY48: + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_ATAPI_IDENTIFY: + handle_atapi_identify(p, slot, cfis); + break; + case ATA_PACKET_CMD: + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else + handle_packet_cmd(p, slot, cfis); + break; + default: + WPRINTF("Unsupported cmd:%02x\n", cfis[2]); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_slot(struct ahci_port *p, int slot) +{ + struct ahci_cmd_hdr *hdr; +#ifdef AHCI_DEBUG + struct ahci_prdt_entry *prdt; +#endif + struct pci_ahci_softc *sc; + uint8_t *cfis; +#ifdef AHCI_DEBUG + int cfl, i; +#endif + + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); +#ifdef AHCI_DEBUG + cfl = (hdr->flags & 0x1f) * 4; +#endif + cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); +#ifdef AHCI_DEBUG + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + + DPRINTF("\ncfis:"); + for (i = 0; i < cfl; i++) { + if (i % 10 == 0) + DPRINTF("\n"); + DPRINTF("%02x ", cfis[i]); + } + DPRINTF("\n"); + + for (i = 0; i < hdr->prdtl; i++) { + DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba); + prdt++; + } +#endif + + if (cfis[0] != FIS_TYPE_REGH2D) { + WPRINTF("Not a H2D FIS:%02x\n", cfis[0]); + return; + } + + if (cfis[1] & 0x80) { + ahci_handle_cmd(p, slot, cfis); + } else { + if (cfis[15] & (1 << 2)) + p->reset = 1; + else if (p->reset) { + p->reset = 0; + ahci_port_reset(p); + } + p->ci &= ~(1 << slot); + } +} + +static void +ahci_handle_port(struct ahci_port *p) +{ + + if (!(p->cmd & AHCI_P_CMD_ST)) + return; + + /* + * Search for any new commands to issue ignoring those that + * are already in-flight. Stop if device is busy or in error. + */ + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { + p->cmd &= ~AHCI_P_CMD_CCS_MASK; + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, p->ccs); + } + } +} + +/* + * blockif callback routine - this runs in the context of the blockif + * i/o thread, so the mutex needs to be acquired. + */ +static void +ata_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint32_t tfd; + uint8_t *cfis; + int slot, ncq, dsm; + + DPRINTF("%s %d\n", __func__, err); + + ncq = dsm = 0; + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); + goto out; + } + + if (!err) + tfd = ATA_S_READY | ATA_S_DSC; + else + tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +atapi_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint8_t *cfis; + uint32_t tfd; + int slot; + + DPRINTF("%s %d\n", __func__, err); + + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); + goto out; + } + + if (!err) { + tfd = ATA_S_READY | ATA_S_DSC; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x21; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +pci_ahci_ioreq_init(struct ahci_port *pr) +{ + struct ahci_ioreq *vr; + int i; + + pr->ioqsz = blockif_queuesz(pr->bctx); + pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); + STAILQ_INIT(&pr->iofhd); + + /* + * Add all i/o request entries to the free queue + */ + for (i = 0; i < pr->ioqsz; i++) { + vr = &pr->ioreq[i]; + vr->io_pr = pr; + if (!pr->atapi) + vr->io_req.br_callback = ata_ioreq_cb; + else + vr->io_req.br_callback = atapi_ioreq_cb; + vr->io_req.br_param = vr; + STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); + } + + TAILQ_INIT(&pr->iobhd); +} + +static void +pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + struct ahci_port *p = &sc->port[port]; + + DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + port, offset, value); + + switch (offset) { + case AHCI_P_CLB: + p->clb = value; + break; + case AHCI_P_CLBU: + p->clbu = value; + break; + case AHCI_P_FB: + p->fb = value; + break; + case AHCI_P_FBU: + p->fbu = value; + break; + case AHCI_P_IS: + p->is &= ~value; + ahci_port_intr(p); + break; + case AHCI_P_IE: + p->ie = value & 0xFDC000FF; + ahci_port_intr(p); + break; + case AHCI_P_CMD: + { + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + + if (!(value & AHCI_P_CMD_ST)) { + ahci_port_stop(p); + } else { + uint64_t clb; + + p->cmd |= AHCI_P_CMD_CR; + clb = (uint64_t)p->clbu << 32 | p->clb; + p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, + AHCI_CL_SIZE * AHCI_MAX_SLOTS); + } + + if (value & AHCI_P_CMD_FRE) { + uint64_t fb; + + p->cmd |= AHCI_P_CMD_FR; + fb = (uint64_t)p->fbu << 32 | p->fb; + /* we don't support FBSCP, so rfis size is 256Bytes */ + p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); + } else { + p->cmd &= ~AHCI_P_CMD_FR; + } + + if (value & AHCI_P_CMD_CLO) { + p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); + p->cmd &= ~AHCI_P_CMD_CLO; + } + + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + + ahci_handle_port(p); + break; + } + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_P_SCTL: + p->sctl = value; + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (value & ATA_SC_DET_RESET) + ahci_port_reset(p); + } + break; + case AHCI_P_SERR: + p->serr &= ~value; + break; + case AHCI_P_SACT: + p->sact |= value; + break; + case AHCI_P_CI: + p->ci |= value; + ahci_handle_port(p); + break; + case AHCI_P_SNTF: + case AHCI_P_FBS: + default: + break; + } +} + +static void +pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + offset, value); + + switch (offset) { + case AHCI_CAP: + case AHCI_PI: + case AHCI_VS: + case AHCI_CAP2: + DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_GHC: + if (value & AHCI_GHC_HR) { + ahci_reset(sc); + break; + } + if (value & AHCI_GHC_IE) + sc->ghc |= AHCI_GHC_IE; + else + sc->ghc &= ~AHCI_GHC_IE; + ahci_generate_intr(sc, 0xffffffff); + break; + case AHCI_IS: + sc->is &= ~value; + ahci_generate_intr(sc, value); + break; + default: + break; + } +} + +static void +pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + + assert(baridx == 5); + assert((offset % 4) == 0 && size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + pci_ahci_host_write(sc, offset, value); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + pci_ahci_port_write(sc, offset, value); + else + WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + + switch (offset) { + case AHCI_CAP: + case AHCI_GHC: + case AHCI_IS: + case AHCI_PI: + case AHCI_VS: + case AHCI_CCCC: + case AHCI_CCCP: + case AHCI_EM_LOC: + case AHCI_EM_CTL: + case AHCI_CAP2: + { + uint32_t *p = &sc->cap; + p += (offset - AHCI_CAP) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n", + offset, value); + + return (value); +} + +static uint64_t +pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + + switch (offset) { + case AHCI_P_CLB: + case AHCI_P_CLBU: + case AHCI_P_FB: + case AHCI_P_FBU: + case AHCI_P_IS: + case AHCI_P_IE: + case AHCI_P_CMD: + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + case AHCI_P_SCTL: + case AHCI_P_SERR: + case AHCI_P_SACT: + case AHCI_P_CI: + case AHCI_P_SNTF: + case AHCI_P_FBS: + { + uint32_t *p= &sc->port[port].clb; + p += (offset - AHCI_P_CLB) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + + DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n", + port, offset, value); + + return value; +} + +static uint64_t +pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t regoff, int size) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; + uint32_t value; + + assert(baridx == 5); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & (size - 1)) == 0); + + pthread_mutex_lock(&sc->mtx); + + offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ + if (offset < AHCI_OFFSET) + value = pci_ahci_host_read(sc, offset); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + value = pci_ahci_port_read(sc, offset); + else { + value = 0; + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); + } + value >>= 8 * (regoff & 0x3); + + pthread_mutex_unlock(&sc->mtx); + + return (value); +} + +static int +pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) +{ + char bident[sizeof("XX:XX:XX")]; + struct blockif_ctxt *bctxt; + struct pci_ahci_softc *sc; + int ret, slots, p; + MD5_CTX mdctx; + u_char digest[16]; + char *next, *next2; + + ret = 0; + +#ifdef AHCI_DEBUG + dbg = fopen("/tmp/log", "w+"); +#endif + + sc = calloc(1, sizeof(struct pci_ahci_softc)); + pi->pi_arg = sc; + sc->asc_pi = pi; + pthread_mutex_init(&sc->mtx, NULL); + sc->ports = 0; + sc->pi = 0; + slots = 32; + + for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { + /* Identify and cut off type of present port. */ + if (strncmp(opts, "hd:", 3) == 0) { + atapi = 0; + opts += 3; + } else if (strncmp(opts, "cd:", 3) == 0) { + atapi = 1; + opts += 3; + } + + /* Find and cut off the next port options. */ + next = strstr(opts, ",hd:"); + next2 = strstr(opts, ",cd:"); + if (next == NULL || (next2 != NULL && next2 < next)) + next = next2; + if (next != NULL) { + next[0] = 0; + next++; + } + + if (opts[0] == 0) + continue; + + /* + * Attempt to open the backing image. Use the PCI slot/func + * and the port number for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, + pi->pi_func, p); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + sc->ports = p; + ret = 1; + goto open_fail; + } + sc->port[p].bctx = bctxt; + sc->port[p].pr_sc = sc; + sc->port[p].port = p; + sc->port[p].atapi = atapi; + +#ifndef __FreeBSD__ + /* + * Attempt to enable the write cache for this device, as the + * guest will issue FLUSH commands when it requires durability. + * + * Failure here is fine, since an always-sync device will not + * have an impact on correctness. + */ + (void) blockif_set_wce(bctxt, 1); +#endif + + /* + * Create an identifier for the backing file. + * Use parts of the md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->port[p].ident, AHCI_PORT_IDENT, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[p]); + + sc->pi |= (1 << p); + if (sc->port[p].ioqsz < slots) + slots = sc->port[p].ioqsz; + } + sc->ports = p; + + /* Intel ICH8 AHCI */ + --slots; + if (sc->ports < DEF_PORTS) + sc->ports = DEF_PORTS; + sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | + AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | + AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| + AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | + (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); + + sc->vs = 0x10300; + sc->cap2 = AHCI_CAP2_APST; + ahci_reset(sc); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); + pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); + p = MIN(sc->ports, 16); + p = flsl(p) - ((p & (p - 1)) ? 0 : 1); + pci_emul_add_msicap(pi, 1 << p); + pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, + AHCI_OFFSET + sc->ports * AHCI_STEP); + + pci_lintr_request(pi); + +open_fail: + if (ret) { + for (p = 0; p < sc->ports; p++) { + if (sc->port[p].bctx != NULL) + blockif_close(sc->port[p].bctx); + } + free(sc); + } + + return (ret); +} + +static int +pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 0)); +} + +static int +pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 1)); +} + +/* + * Use separate emulation names to distinguish drive and atapi devices + */ +struct pci_devemu pci_de_ahci = { + .pe_emu = "ahci", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci); + +struct pci_devemu pci_de_ahci_hd = { + .pe_emu = "ahci-hd", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_hd); + +struct pci_devemu pci_de_ahci_cd = { + .pe_emu = "ahci-cd", + .pe_init = pci_ahci_atapi_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c new file mode 100644 index 0000000000..e211b5cf9c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -0,0 +1,2418 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * Copyright (c) 2013 Jeremiah Lott, Avere Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/limits.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#ifndef __FreeBSD__ +#include <sys/filio.h> +#endif + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <md5.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "e1000_regs.h" +#include "e1000_defines.h" +#include "mii.h" + +#include "bhyverun.h" +#include "pci_emul.h" +#include "mevent.h" + +/* Hardware/register definitions XXX: move some to common code. */ +#define E82545_VENDOR_ID_INTEL 0x8086 +#define E82545_DEV_ID_82545EM_COPPER 0x100F +#define E82545_SUBDEV_ID 0x1008 + +#define E82545_REVISION_4 4 + +#define E82545_MDIC_DATA_MASK 0x0000FFFF +#define E82545_MDIC_OP_MASK 0x0c000000 +#define E82545_MDIC_IE 0x20000000 + +#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ +#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ +#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ + +#define E82545_BAR_REGISTER 0 +#define E82545_BAR_REGISTER_LEN (128*1024) +#define E82545_BAR_FLASH 1 +#define E82545_BAR_FLASH_LEN (64*1024) +#define E82545_BAR_IO 2 +#define E82545_BAR_IO_LEN 8 + +#define E82545_IOADDR 0x00000000 +#define E82545_IODATA 0x00000004 +#define E82545_IO_REGISTER_MAX 0x0001FFFF +#define E82545_IO_FLASH_BASE 0x00080000 +#define E82545_IO_FLASH_MAX 0x000FFFFF + +#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) +#define E82545_RAR_MAX 15 +#define E82545_MTA_MAX 127 +#define E82545_VFTA_MAX 127 + +/* Slightly modified from the driver versions, hardcoded for 3 opcode bits, + * followed by 6 address bits. + * TODO: make opcode bits and addr bits configurable? + * NVM Commands - Microwire */ +#define E82545_NVM_OPCODE_BITS 3 +#define E82545_NVM_ADDR_BITS 6 +#define E82545_NVM_DATA_BITS 16 +#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) +#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) +#define E82545_NVM_OPCODE_MASK \ + (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) +#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ +#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ +#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ +#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ + +#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ + +#define E1000_ICR_SRPD 0x00010000 + +/* This is an arbitrary number. There is no hard limit on the chip. */ +#define I82545_MAX_TXSEGS 64 + +/* Legacy receive descriptor */ +struct e1000_rx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + uint16_t length; /* Length of data DMAed into data buffer */ + uint16_t csum; /* Packet checksum */ + uint8_t status; /* Descriptor status */ + uint8_t errors; /* Descriptor Errors */ + uint16_t special; +}; + +/* Transmit descriptor types */ +#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) +#define E1000_TXD_TYP_L (0) +#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) +#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) + +/* Legacy transmit descriptor */ +struct e1000_tx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t cso; /* Checksum offset */ + uint8_t cmd; /* Descriptor control */ + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t css; /* Checksum start */ + uint16_t special; + } fields; + } upper; +}; + +/* Context descriptor */ +struct e1000_context_desc { + union { + uint32_t ip_config; + struct { + uint8_t ipcss; /* IP checksum start */ + uint8_t ipcso; /* IP checksum offset */ + uint16_t ipcse; /* IP checksum end */ + } ip_fields; + } lower_setup; + union { + uint32_t tcp_config; + struct { + uint8_t tucss; /* TCP checksum start */ + uint8_t tucso; /* TCP checksum offset */ + uint16_t tucse; /* TCP checksum end */ + } tcp_fields; + } upper_setup; + uint32_t cmd_and_length; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t hdr_len; /* Header length */ + uint16_t mss; /* Maximum segment size */ + } fields; + } tcp_seg_setup; +}; + +/* Data descriptor */ +struct e1000_data_desc { + uint64_t buffer_addr; /* Address of the descriptor's buffer address */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t typ_len_ext; + uint8_t cmd; + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t popts; /* Packet Options */ + uint16_t special; + } fields; + } upper; +}; + +union e1000_tx_udesc { + struct e1000_tx_desc td; + struct e1000_context_desc cd; + struct e1000_data_desc dd; +}; + +/* Tx checksum info for a packet. */ +struct ck_info { + int ck_valid; /* ck_info is valid */ + uint8_t ck_start; /* start byte of cksum calcuation */ + uint8_t ck_off; /* offset of cksum insertion */ + uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ +}; + +/* + * Debug printf + */ +static int e82545_debug = 0; +#define DPRINTF(msg,params...) if (e82545_debug) fprintf(stderr, "e82545: " msg, params) +#define WPRINTF(msg,params...) fprintf(stderr, "e82545: " msg, params) + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +/* s/w representation of the RAL/RAH regs */ +struct eth_uni { + int eu_valid; + int eu_addrsel; + struct ether_addr eu_eth; +}; + + +struct e82545_softc { + struct pci_devinst *esc_pi; + struct vmctx *esc_ctx; + struct mevent *esc_mevp; + struct mevent *esc_mevpitr; + pthread_mutex_t esc_mtx; + struct ether_addr esc_mac; + int esc_tapfd; + + /* General */ + uint32_t esc_CTRL; /* x0000 device ctl */ + uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ + uint32_t esc_FCAH; /* x002C flow ctl addr hi */ + uint32_t esc_FCT; /* x0030 flow ctl type */ + uint32_t esc_VET; /* x0038 VLAN eth type */ + uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ + uint32_t esc_LEDCTL; /* x0E00 LED control */ + uint32_t esc_PBA; /* x1000 pkt buffer allocation */ + + /* Interrupt control */ + int esc_irq_asserted; + uint32_t esc_ICR; /* x00C0 cause read/clear */ + uint32_t esc_ITR; /* x00C4 intr throttling */ + uint32_t esc_ICS; /* x00C8 cause set */ + uint32_t esc_IMS; /* x00D0 mask set/read */ + uint32_t esc_IMC; /* x00D8 mask clear */ + + /* Transmit */ + union e1000_tx_udesc *esc_txdesc; + struct e1000_context_desc esc_txctx; + pthread_t esc_tx_tid; + pthread_cond_t esc_tx_cond; + int esc_tx_enabled; + int esc_tx_active; + uint32_t esc_TXCW; /* x0178 transmit config */ + uint32_t esc_TCTL; /* x0400 transmit ctl */ + uint32_t esc_TIPG; /* x0410 inter-packet gap */ + uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ + uint64_t esc_tdba; /* verified 64-bit desc table addr */ + uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ + uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ + uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ + uint16_t esc_TDH; /* x3810 desc table head idx */ + uint16_t esc_TDHr; /* internal read version of TDH */ + uint16_t esc_TDT; /* x3818 desc table tail idx */ + uint32_t esc_TIDV; /* x3820 intr delay */ + uint32_t esc_TXDCTL; /* x3828 desc control */ + uint32_t esc_TADV; /* x382C intr absolute delay */ + + /* L2 frame acceptance */ + struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ + uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ + uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ + + /* Receive */ + struct e1000_rx_desc *esc_rxdesc; + pthread_cond_t esc_rx_cond; + int esc_rx_enabled; + int esc_rx_active; + int esc_rx_loopback; + uint32_t esc_RCTL; /* x0100 receive ctl */ + uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ + uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ + uint64_t esc_rdba; /* verified 64-bit desc table addr */ + uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ + uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ + uint32_t esc_RDLEN; /* x2808 #descriptors */ + uint16_t esc_RDH; /* x2810 desc table head idx */ + uint16_t esc_RDT; /* x2818 desc table tail idx */ + uint32_t esc_RDTR; /* x2820 intr delay */ + uint32_t esc_RXDCTL; /* x2828 desc control */ + uint32_t esc_RADV; /* x282C intr absolute delay */ + uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ + uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ + + /* IO Port register access */ + uint32_t io_addr; + + /* Shadow copy of MDIC */ + uint32_t mdi_control; + /* Shadow copy of EECD */ + uint32_t eeprom_control; + /* Latest NVM in/out */ + uint16_t nvm_data; + uint16_t nvm_opaddr; + /* stats */ + uint32_t missed_pkt_count; /* dropped for no room in rx queue */ + uint32_t pkt_rx_by_size[6]; + uint32_t pkt_tx_by_size[6]; + uint32_t good_pkt_rx_count; + uint32_t bcast_pkt_rx_count; + uint32_t mcast_pkt_rx_count; + uint32_t good_pkt_tx_count; + uint32_t bcast_pkt_tx_count; + uint32_t mcast_pkt_tx_count; + uint32_t oversize_rx_count; + uint32_t tso_tx_count; + uint64_t good_octets_rx; + uint64_t good_octets_tx; + uint64_t missed_octets; /* counts missed and oversized */ + + uint8_t nvm_bits:6; /* number of bits remaining in/out */ + uint8_t nvm_mode:2; +#define E82545_NVM_MODE_OPADDR 0x0 +#define E82545_NVM_MODE_DATAIN 0x1 +#define E82545_NVM_MODE_DATAOUT 0x2 + /* EEPROM data */ + uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; +}; + +static void e82545_reset(struct e82545_softc *sc, int dev); +static void e82545_rx_enable(struct e82545_softc *sc); +static void e82545_rx_disable(struct e82545_softc *sc); +#ifdef __FreeBSD__ +static void e82545_tap_callback(int fd, enum ev_type type, void *param); +#endif +static void e82545_tx_start(struct e82545_softc *sc); +static void e82545_tx_enable(struct e82545_softc *sc); +static void e82545_tx_disable(struct e82545_softc *sc); + +static inline int +e82545_size_stat_index(uint32_t size) +{ + if (size <= 64) { + return 0; + } else if (size >= 1024) { + return 5; + } else { + /* should be 1-4 */ + return (ffs(size) - 6); + } +} + +static void +e82545_init_eeprom(struct e82545_softc *sc) +{ + uint16_t checksum, i; + + /* mac addr */ + sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | + (((uint16_t)sc->esc_mac.octet[1]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | + (((uint16_t)sc->esc_mac.octet[3]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | + (((uint16_t)sc->esc_mac.octet[5]) << 8); + + /* pci ids */ + sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; + sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; + sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; + sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; + + /* fill in the checksum */ + checksum = 0; + for (i = 0; i < NVM_CHECKSUM_REG; i++) { + checksum += sc->eeprom_data[i]; + } + checksum = NVM_SUM - checksum; + sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; + DPRINTF("eeprom checksum: 0x%x\r\n", checksum); +} + +static void +e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr, uint32_t data) +{ + DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data); +} + +static uint32_t +e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr) +{ + //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + switch (reg_addr) { + case PHY_STATUS: + return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | + MII_SR_AUTONEG_COMPLETE); + case PHY_AUTONEG_ADV: + return NWAY_AR_SELECTOR_FIELD; + case PHY_LP_ABILITY: + return 0; + case PHY_1000T_STATUS: + return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | + SR_1000T_LOCAL_RX_STATUS); + case PHY_ID1: + return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; + case PHY_ID2: + return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; + default: + DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + return 0; + } + /* not reached */ +} + +static void +e82545_eecd_strobe(struct e82545_softc *sc) +{ + /* Microwire state machine */ + /* + DPRINTF("eeprom state machine srtobe " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data);*/ + + if (sc->nvm_bits == 0) { + DPRINTF("eeprom state machine not expecting data! " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + return; + } + sc->nvm_bits--; + if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { + /* shifting out */ + if (sc->nvm_data & 0x8000) { + sc->eeprom_control |= E1000_EECD_DO; + } else { + sc->eeprom_control &= ~E1000_EECD_DO; + } + sc->nvm_data <<= 1; + if (sc->nvm_bits == 0) { + /* read done, back to opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { + /* shifting in */ + sc->nvm_data <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_data |= 1; + } + if (sc->nvm_bits == 0) { + /* eeprom write */ + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; + if (op != E82545_NVM_OPCODE_WRITE) { + DPRINTF("Illegal eeprom write op 0x%x\r\n", + sc->nvm_opaddr); + } else if (addr >= E82545_NVM_EEPROM_SIZE) { + DPRINTF("Illegal eeprom write addr 0x%x\r\n", + sc->nvm_opaddr); + } else { + DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + sc->eeprom_data[addr] = sc->nvm_data; + } + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { + sc->nvm_opaddr <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_opaddr |= 1; + } + if (sc->nvm_bits == 0) { + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + switch (op) { + case E82545_NVM_OPCODE_EWEN: + DPRINTF("eeprom write enable: 0x%x\r\n", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + break; + case E82545_NVM_OPCODE_READ: + { + uint16_t addr = sc->nvm_opaddr & + E82545_NVM_ADDR_MASK; + sc->nvm_mode = E82545_NVM_MODE_DATAOUT; + sc->nvm_bits = E82545_NVM_DATA_BITS; + if (addr < E82545_NVM_EEPROM_SIZE) { + sc->nvm_data = sc->eeprom_data[addr]; + DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + } else { + DPRINTF("eeprom illegal read: 0x%x\r\n", + sc->nvm_opaddr); + sc->nvm_data = 0; + } + break; + } + case E82545_NVM_OPCODE_WRITE: + sc->nvm_mode = E82545_NVM_MODE_DATAIN; + sc->nvm_bits = E82545_NVM_DATA_BITS; + sc->nvm_data = 0; + break; + default: + DPRINTF("eeprom unknown op: 0x%x\r\r", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } + } else { + DPRINTF("eeprom state machine wrong state! " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + } +} + +#ifdef __FreeBSD__ +static void +e82545_itr_callback(int fd, enum ev_type type, void *param) +{ + uint32_t new; + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + new = sc->esc_ICR & sc->esc_IMS; + if (new && !sc->esc_irq_asserted) { + DPRINTF("itr callback: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + } else { + mevent_delete(sc->esc_mevpitr); + sc->esc_mevpitr = NULL; + } + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static void +e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + DPRINTF("icr assert: 0x%x\r\n", bits); + + /* + * An interrupt is only generated if bits are set that + * aren't already in the ICR, these bits are unmasked, + * and there isn't an interrupt already pending. + */ + new = bits & ~sc->esc_ICR & sc->esc_IMS; + sc->esc_ICR |= bits; + + if (new == 0) { + DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("icr assert: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_ims_change(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + /* + * Changing the mask may allow previously asserted + * but masked interrupt requests to generate an interrupt. + */ + new = bits & sc->esc_ICR & ~sc->esc_IMS; + sc->esc_IMS |= bits; + + if (new == 0) { + DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("ims change: lintr assert %x\n\r", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) +{ + + DPRINTF("icr deassert: 0x%x\r\n", bits); + sc->esc_ICR &= ~bits; + + /* + * If there are no longer any interrupt sources and there + * was an asserted interrupt, clear it + */ + if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { + DPRINTF("icr deassert: lintr deassert %x\r\n", bits); + pci_lintr_deassert(sc->esc_pi); + sc->esc_irq_asserted = 0; + } +} + +static void +e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + + DPRINTF("intr_write: off %x, val %x\n\r", offset, value); + + switch (offset) { + case E1000_ICR: + e82545_icr_deassert(sc, value); + break; + case E1000_ITR: + sc->esc_ITR = value; + break; + case E1000_ICS: + sc->esc_ICS = value; /* not used: store for debug */ + e82545_icr_assert(sc, value); + break; + case E1000_IMS: + e82545_ims_change(sc, value); + break; + case E1000_IMC: + sc->esc_IMC = value; /* for debug */ + sc->esc_IMS &= ~value; + // XXX clear interrupts if all ICR bits now masked + // and interrupt was pending ? + break; + default: + break; + } +} + +static uint32_t +e82545_intr_read(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + + retval = 0; + + DPRINTF("intr_read: off %x\n\r", offset); + + switch (offset) { + case E1000_ICR: + retval = sc->esc_ICR; + sc->esc_ICR = 0; + e82545_icr_deassert(sc, ~0); + break; + case E1000_ITR: + retval = sc->esc_ITR; + break; + case E1000_ICS: + /* write-only register */ + break; + case E1000_IMS: + retval = sc->esc_IMS; + break; + case E1000_IMC: + /* write-only register */ + break; + default: + break; + } + + return (retval); +} + +static void +e82545_devctl(struct e82545_softc *sc, uint32_t val) +{ + + sc->esc_CTRL = val & ~E1000_CTRL_RST; + + if (val & E1000_CTRL_RST) { + DPRINTF("e1k: s/w reset, ctl %x\n", val); + e82545_reset(sc, 1); + } + /* XXX check for phy reset ? */ +} + +static void +e82545_rx_update_rdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | + sc->esc_RDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, + sc->esc_rdba, sc->esc_RDLEN); +} + +static void +e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); + + /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ + sc->esc_RCTL = val & ~0xF9204c01; + + DPRINTF("rx_ctl - %s RCTL %x, val %x\n", + on ? "on" : "off", sc->esc_RCTL, val); + + /* state change requested */ + if (on != sc->esc_rx_enabled) { + if (on) { + /* Catch disallowed/unimplemented settings */ + //assert(!(val & E1000_RCTL_LBM_TCVR)); + + if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { + sc->esc_rx_loopback = 1; + } else { + sc->esc_rx_loopback = 0; + } + + e82545_rx_update_rdba(sc); + e82545_rx_enable(sc); + } else { + e82545_rx_disable(sc); + sc->esc_rx_loopback = 0; + sc->esc_rdba = 0; + sc->esc_rxdesc = NULL; + } + } +} + +static void +e82545_tx_update_tdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, + sc->esc_TDLEN); +} + +static void +e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); + + /* ignore TCTL_EN settings that don't change state */ + if (on == sc->esc_tx_enabled) + return; + + if (on) { + e82545_tx_update_tdba(sc); + e82545_tx_enable(sc); + } else { + e82545_tx_disable(sc); + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + } + + /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ + sc->esc_TCTL = val & ~0xFE800005; +} + +int +e82545_bufsz(uint32_t rctl) +{ + + switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { + case (E1000_RCTL_SZ_2048): return (2048); + case (E1000_RCTL_SZ_1024): return (1024); + case (E1000_RCTL_SZ_512): return (512); + case (E1000_RCTL_SZ_256): return (256); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); + } + return (256); /* Forbidden value. */ +} + +#ifdef __FreeBSD__ +static uint8_t dummybuf[2048]; + +/* XXX one packet at a time until this is debugged */ +static void +e82545_tap_callback(int fd, enum ev_type type, void *param) +{ + struct e82545_softc *sc = param; + struct e1000_rx_desc *rxd; + struct iovec vec[64]; + int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; + uint32_t cause = 0; + uint16_t *tp, tag, head; + + pthread_mutex_lock(&sc->esc_mtx); + DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + + if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { + DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n", + sc->esc_rx_enabled, sc->esc_rx_loopback); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + bufsz = e82545_bufsz(sc->esc_RCTL); + maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; + maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; + size = sc->esc_RDLEN / 16; + head = sc->esc_RDH; + left = (size + sc->esc_RDT - head) % size; + if (left < maxpktdesc) { + DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n", + left, maxpktdesc); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + + sc->esc_rx_active = 1; + pthread_mutex_unlock(&sc->esc_mtx); + + for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { + + /* Grab rx descriptor pointed to by the head pointer */ + for (i = 0; i < maxpktdesc; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + vec[i].iov_base = paddr_guest2host(sc->esc_ctx, + rxd->buffer_addr, bufsz); + vec[i].iov_len = bufsz; + } + len = readv(sc->esc_tapfd, vec, maxpktdesc); + if (len <= 0) { + DPRINTF("tap: readv() returned %d\n", len); + goto done; + } + + /* + * Adjust the packet length based on whether the CRC needs + * to be stripped or if the packet is less than the minimum + * eth packet size. + */ + if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) + len = ETHER_MIN_LEN - ETHER_CRC_LEN; + if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) + len += ETHER_CRC_LEN; + n = (len + bufsz - 1) / bufsz; + + DPRINTF("packet read %d bytes, %d segs, head %d\r\n", + len, n, head); + + /* Apply VLAN filter. */ + tp = (uint16_t *)vec[0].iov_base + 6; + if ((sc->esc_RCTL & E1000_RCTL_VFE) && + (ntohs(tp[0]) == sc->esc_VET)) { + tag = ntohs(tp[1]) & 0x0fff; + if ((sc->esc_fvlan[tag >> 5] & + (1 << (tag & 0x1f))) != 0) { + DPRINTF("known VLAN %d\r\n", tag); + } else { + DPRINTF("unknown VLAN %d\r\n", tag); + n = 0; + continue; + } + } + + /* Update all consumed descriptors. */ + for (i = 0; i < n - 1; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + rxd->status = E1000_RXD_STAT_DD; + } + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = len % bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + /* XXX signal no checksum for now */ + rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | + E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; + + /* Schedule receive interrupts. */ + if (len <= sc->esc_RSRPD) { + cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; + } else { + /* XXX: RDRT and RADV timers should be here. */ + cause |= E1000_ICR_RXT0; + } + + head = (head + n) % size; + left -= n; + } + +done: + pthread_mutex_lock(&sc->esc_mtx); + sc->esc_rx_active = 0; + if (sc->esc_rx_enabled == 0) + pthread_cond_signal(&sc->esc_rx_cond); + + sc->esc_RDH = head; + /* Respect E1000_RCTL_RDMTS */ + left = (size + sc->esc_RDT - head) % size; + if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) + cause |= E1000_ICR_RXDMT0; + /* Assert all accumulated interrupts. */ + if (cause != 0) + e82545_icr_assert(sc, cause); +done1: + DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static uint16_t +e82545_carry(uint32_t sum) +{ + + sum = (sum & 0xFFFF) + (sum >> 16); + if (sum > 0xFFFF) + sum -= 0xFFFF; + return (sum); +} + +static uint16_t +#ifdef __FreeBSD__ +e82545_buf_checksum(uint8_t *buf, int len) +#else +e82545_buf_checksum(caddr_t buf, int len) +#endif +{ + int i; + uint32_t sum = 0; + + /* Checksum all the pairs of bytes first... */ + for (i = 0; i < (len & ~1U); i += 2) + sum += *((u_int16_t *)(buf + i)); + + /* + * If there's a single byte left over, checksum it, too. + * Network byte order is big-endian, so the remaining byte is + * the high byte. + */ + if (i < len) + sum += htons(buf[i] << 8); + + return (e82545_carry(sum)); +} + +static uint16_t +e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) +{ + int now, odd; + uint32_t sum = 0, s; + + /* Skip completely unneeded vectors. */ + while (iovcnt > 0 && iov->iov_len <= off && off > 0) { + off -= iov->iov_len; + iov++; + iovcnt--; + } + + /* Calculate checksum of requested range. */ + odd = 0; + while (len > 0 && iovcnt > 0) { + now = MIN(len, iov->iov_len - off); + s = e82545_buf_checksum(iov->iov_base + off, now); + sum += odd ? (s << 8) : s; + odd ^= (now & 1); + len -= now; + off = 0; + iov++; + iovcnt--; + } + + return (e82545_carry(sum)); +} + +/* + * Return the transmit descriptor type. + */ +int +e82545_txdesc_type(uint32_t lower) +{ + int type; + + type = 0; + + if (lower & E1000_TXD_CMD_DEXT) + type = lower & E1000_TXD_MASK; + + return (type); +} + +static void +e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) +{ + uint16_t cksum; + int cklen; + + DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n", + iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); + cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1 : INT_MAX; + cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); + *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; +} + +static void +e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) +{ + + if (sc->esc_tapfd == -1) + return; + + (void) writev(sc->esc_tapfd, iov, iovcnt); +} + +static void +e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, int *tdwb) +{ + union e1000_tx_udesc *dsc; + + for ( ; head != tail; head = (head + 1) % dsize) { + dsc = &sc->esc_txdesc[head]; + if (dsc->td.lower.data & E1000_TXD_CMD_RS) { + dsc->td.upper.data |= E1000_TXD_STAT_DD; + *tdwb = 1; + } + } +} + +static int +e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, uint16_t *rhead, int *tdwb) +{ +#ifdef __FreeBSD__ + uint8_t *hdr, *hdrp; +#else + caddr_t hdr, hdrp; +#endif + struct iovec iovb[I82545_MAX_TXSEGS + 2]; + struct iovec tiov[I82545_MAX_TXSEGS + 2]; + struct e1000_context_desc *cd; + struct ck_info ckinfo[2]; + struct iovec *iov; + union e1000_tx_udesc *dsc; + int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso; + int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; + uint32_t tcpsum, tcpseq; + uint16_t ipcs, tcpcs, ipid, ohead; + + ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; + iovcnt = 0; + tlen = 0; + ntype = 0; + tso = 0; + ohead = head; + hdr = NULL; + + /* iovb[0/1] may be used for writable copy of headers. */ + iov = &iovb[2]; + + for (desc = 0; ; desc++, head = (head + 1) % dsize) { + if (head == tail) { + *rhead = head; + return (0); + } + dsc = &sc->esc_txdesc[head]; + dtype = e82545_txdesc_type(dsc->td.lower.data); + + if (desc == 0) { + switch (dtype) { + case E1000_TXD_TYP_C: + DPRINTF("tx ctxt desc idx %d: %016jx " + "%08x%08x\r\n", + head, dsc->td.buffer_addr, + dsc->td.upper.data, dsc->td.lower.data); + /* Save context and return */ + sc->esc_txctx = dsc->cd; + goto done; + case E1000_TXD_TYP_L: + DPRINTF("tx legacy desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + /* + * legacy cksum start valid in first descriptor + */ + ntype = dtype; + ckinfo[0].ck_start = dsc->td.upper.fields.css; + break; + case E1000_TXD_TYP_D: + DPRINTF("tx data desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + ntype = dtype; + break; + default: + break; + } + } else { + /* Descriptor type must be consistent */ + assert(dtype == ntype); + DPRINTF("tx next desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + } + + len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : + dsc->dd.lower.data & 0xFFFFF; + + if (len > 0) { + /* Strip checksum supplied by guest. */ + if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && + (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) + len -= 2; + tlen += len; + if (iovcnt < I82545_MAX_TXSEGS) { + iov[iovcnt].iov_base = paddr_guest2host( + sc->esc_ctx, dsc->td.buffer_addr, len); + iov[iovcnt].iov_len = len; + } + iovcnt++; + } + + /* + * Pull out info that is valid in the final descriptor + * and exit descriptor loop. + */ + if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { + if (dtype == E1000_TXD_TYP_L) { + if (dsc->td.lower.data & E1000_TXD_CMD_IC) { + ckinfo[0].ck_valid = 1; + ckinfo[0].ck_off = + dsc->td.lower.flags.cso; + ckinfo[0].ck_len = 0; + } + } else { + cd = &sc->esc_txctx; + if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) + tso = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM) + ckinfo[0].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM || tso) { + ckinfo[0].ck_start = + cd->lower_setup.ip_fields.ipcss; + ckinfo[0].ck_off = + cd->lower_setup.ip_fields.ipcso; + ckinfo[0].ck_len = + cd->lower_setup.ip_fields.ipcse; + } + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM) + ckinfo[1].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM || tso) { + ckinfo[1].ck_start = + cd->upper_setup.tcp_fields.tucss; + ckinfo[1].ck_off = + cd->upper_setup.tcp_fields.tucso; + ckinfo[1].ck_len = + cd->upper_setup.tcp_fields.tucse; + } + } + break; + } + } + + if (iovcnt > I82545_MAX_TXSEGS) { + WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n", + iovcnt, I82545_MAX_TXSEGS); + goto done; + } + + hdrlen = vlen = 0; + /* Estimate writable space for VLAN header insertion. */ + if ((sc->esc_CTRL & E1000_CTRL_VME) && + (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { + hdrlen = ETHER_ADDR_LEN*2; + vlen = ETHER_VLAN_ENCAP_LEN; + } + if (!tso) { + /* Estimate required writable space for checksums. */ + if (ckinfo[0].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); + if (ckinfo[1].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); + /* Round up writable space to the first vector. */ + if (hdrlen != 0 && iov[0].iov_len > hdrlen && + iov[0].iov_len < hdrlen + 100) + hdrlen = iov[0].iov_len; + } else { + /* In case of TSO header length provided by software. */ + hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; + } + + /* Allocate, fill and prepend writable header vector. */ + if (hdrlen != 0) { + hdr = __builtin_alloca(hdrlen + vlen); + hdr += vlen; + for (left = hdrlen, hdrp = hdr; left > 0; + left -= now, hdrp += now) { + now = MIN(left, iov->iov_len); + memcpy(hdrp, iov->iov_base, now); + iov->iov_base += now; + iov->iov_len -= now; + if (iov->iov_len == 0) { + iov++; + iovcnt--; + } + } + iov--; + iovcnt++; + iov->iov_base = hdr; + iov->iov_len = hdrlen; + } + + /* Insert VLAN tag. */ + if (vlen != 0) { + hdr -= ETHER_VLAN_ENCAP_LEN; + memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); + hdrlen += ETHER_VLAN_ENCAP_LEN; + hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; + hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; + hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; + hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; + iov->iov_base = hdr; + iov->iov_len += ETHER_VLAN_ENCAP_LEN; + /* Correct checksum offsets after VLAN tag insertion. */ + ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[0].ck_len != 0) + ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[1].ck_len != 0) + ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; + } + + /* Simple non-TSO case. */ + if (!tso) { + /* Calculate checksums and transmit. */ + if (ckinfo[0].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); + if (ckinfo[1].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); + e82545_transmit_backend(sc, iov, iovcnt); + goto done; + } + + /* Doing TSO. */ + tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; + mss = sc->esc_txctx.tcp_seg_setup.fields.mss; + paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); + DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n", + tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); + ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); + tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); + ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; + tcpcs = 0; + if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ + tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; + pv = 1; + pvoff = 0; + for (seg = 0, left = paylen; left > 0; seg++, left -= now) { + now = MIN(left, mss); + + /* Construct IOVs for the segment. */ + /* Include whole original header. */ + tiov[0].iov_base = hdr; + tiov[0].iov_len = hdrlen; + tiovcnt = 1; + /* Include respective part of payload IOV. */ + for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { + nnow = MIN(nleft, iov[pv].iov_len - pvoff); + tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; + tiov[tiovcnt++].iov_len = nnow; + if (pvoff + nnow == iov[pv].iov_len) { + pv++; + pvoff = 0; + } else + pvoff += nnow; + } + DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n", + seg, hdrlen, now, tiovcnt); + + /* Update IP header. */ + if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { + /* IPv4 -- set length and ID */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = + htons(hdrlen - ckinfo[0].ck_start + now); + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(ipid + seg); + } else { + /* IPv6 -- set length */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(hdrlen - ckinfo[0].ck_start - 40 + + now); + } + + /* Update pseudo-header checksum. */ + tcpsum = tcpcs; + tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); + + /* Update TCP/UDP headers. */ + if (tcp) { + /* Update sequence number and FIN/PUSH flags. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + htonl(tcpseq + paylen - left); + if (now < left) { + hdr[ckinfo[1].ck_start + 13] &= + ~(TH_FIN | TH_PUSH); + } + } else { + /* Update payload length. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + hdrlen - ckinfo[1].ck_start + now; + } + + /* Calculate checksums and transmit. */ + if (ckinfo[0].ck_valid) { + *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); + } + if (ckinfo[1].ck_valid) { + *(uint16_t *)&hdr[ckinfo[1].ck_off] = + e82545_carry(tcpsum); + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); + } + e82545_transmit_backend(sc, tiov, tiovcnt); + } + +done: + head = (head + 1) % dsize; + e82545_transmit_done(sc, ohead, head, dsize, tdwb); + + *rhead = head; + return (desc + 1); +} + +static void +e82545_tx_run(struct e82545_softc *sc) +{ + uint32_t cause; + uint16_t head, rhead, tail, size; + int lim, tdwb, sent; + + head = sc->esc_TDH; + tail = sc->esc_TDT; + size = sc->esc_TDLEN / 16; + DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); + + pthread_mutex_unlock(&sc->esc_mtx); + rhead = head; + tdwb = 0; + for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { + sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); + if (sent == 0) + break; + head = rhead; + } + pthread_mutex_lock(&sc->esc_mtx); + + sc->esc_TDH = head; + sc->esc_TDHr = rhead; + cause = 0; + if (tdwb) + cause |= E1000_ICR_TXDW; + if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) + cause |= E1000_ICR_TXQE; + if (cause) + e82545_icr_assert(sc, cause); + + DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); +} + +static void * +e82545_tx_thread(void *param) +{ + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + for (;;) { + while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { + if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) + break; + sc->esc_tx_active = 0; + if (sc->esc_tx_enabled == 0) + pthread_cond_signal(&sc->esc_tx_cond); + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); + } + sc->esc_tx_active = 1; + + /* Process some tx descriptors. Lock dropped inside. */ + e82545_tx_run(sc); + } +#ifndef __FreeBSD__ + return (NULL); +#endif +} + +static void +e82545_tx_start(struct e82545_softc *sc) +{ + + if (sc->esc_tx_active == 0) + pthread_cond_signal(&sc->esc_tx_cond); +} + +static void +e82545_tx_enable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 1; +} + +static void +e82545_tx_disable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 0; + while (sc->esc_tx_active) + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); +} + +static void +e82545_rx_enable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 1; +} + +static void +e82545_rx_disable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 0; + while (sc->esc_rx_active) + pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); +} + +static void +e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) +{ + struct eth_uni *eu; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); + eu->eu_addrsel = (wval >> 16) & 0x3; + eu->eu_eth.octet[5] = wval >> 8; + eu->eu_eth.octet[4] = wval; + } else { + /* RAL */ + eu->eu_eth.octet[3] = wval >> 24; + eu->eu_eth.octet[2] = wval >> 16; + eu->eu_eth.octet[1] = wval >> 8; + eu->eu_eth.octet[0] = wval; + } +} + +static uint32_t +e82545_read_ra(struct e82545_softc *sc, int reg) +{ + struct eth_uni *eu; + uint32_t retval; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + retval = (eu->eu_valid << 31) | + (eu->eu_addrsel << 16) | + (eu->eu_eth.octet[5] << 8) | + eu->eu_eth.octet[4]; + } else { + /* RAL */ + retval = (eu->eu_eth.octet[3] << 24) | + (eu->eu_eth.octet[2] << 16) | + (eu->eu_eth.octet[1] << 8) | + eu->eu_eth.octet[0]; + } + + return (retval); +} + +static void +e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value); + return; + } + DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value); + + switch (offset) { + case E1000_CTRL: + case E1000_CTRL_DUP: + e82545_devctl(sc, value); + break; + case E1000_FCAL: + sc->esc_FCAL = value; + break; + case E1000_FCAH: + sc->esc_FCAH = value & ~0xFFFF0000; + break; + case E1000_FCT: + sc->esc_FCT = value & ~0xFFFF0000; + break; + case E1000_VET: + sc->esc_VET = value & ~0xFFFF0000; + break; + case E1000_FCTTV: + sc->esc_FCTTV = value & ~0xFFFF0000; + break; + case E1000_LEDCTL: + sc->esc_LEDCTL = value & ~0x30303000; + break; + case E1000_PBA: + sc->esc_PBA = value & 0x0000FF80; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + e82545_intr_write(sc, offset, value); + break; + case E1000_RCTL: + e82545_rx_ctl(sc, value); + break; + case E1000_FCRTL: + sc->esc_FCRTL = value & ~0xFFFF0007; + break; + case E1000_FCRTH: + sc->esc_FCRTH = value & ~0xFFFF0007; + break; + case E1000_RDBAL(0): + sc->esc_RDBAL = value & ~0xF; + if (sc->esc_rx_enabled) { + /* Apparently legal: update cached address */ + e82545_rx_update_rdba(sc); + } + break; + case E1000_RDBAH(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDBAH = value; + break; + case E1000_RDLEN(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDLEN = value & ~0xFFF0007F; + break; + case E1000_RDH(0): + /* XXX should only ever be zero ? Range check ? */ + sc->esc_RDH = value; + break; + case E1000_RDT(0): + /* XXX if this opens up the rx ring, do something ? */ + sc->esc_RDT = value; + break; + case E1000_RDTR: + /* ignore FPD bit 31 */ + sc->esc_RDTR = value & ~0xFFFF0000; + break; + case E1000_RXDCTL(0): + sc->esc_RXDCTL = value & ~0xFEC0C0C0; + break; + case E1000_RADV: + sc->esc_RADV = value & ~0xFFFF0000; + break; + case E1000_RSRPD: + sc->esc_RSRPD = value & ~0xFFFFF000; + break; + case E1000_RXCSUM: + sc->esc_RXCSUM = value & ~0xFFFFF800; + break; + case E1000_TXCW: + sc->esc_TXCW = value & ~0x3FFF0000; + break; + case E1000_TCTL: + e82545_tx_ctl(sc, value); + break; + case E1000_TIPG: + sc->esc_TIPG = value; + break; + case E1000_AIT: + sc->esc_AIT = value; + break; + case E1000_TDBAL(0): + sc->esc_TDBAL = value & ~0xF; + if (sc->esc_tx_enabled) { + /* Apparently legal */ + e82545_tx_update_tdba(sc); + } + break; + case E1000_TDBAH(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDBAH = value; + break; + case E1000_TDLEN(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDLEN = value & ~0xFFF0007F; + break; + case E1000_TDH(0): + //assert(!sc->esc_tx_enabled); + /* XXX should only ever be zero ? Range check ? */ + sc->esc_TDHr = sc->esc_TDH = value; + break; + case E1000_TDT(0): + /* XXX range check ? */ + sc->esc_TDT = value; + if (sc->esc_tx_enabled) + e82545_tx_start(sc); + break; + case E1000_TIDV: + sc->esc_TIDV = value & ~0xFFFF0000; + break; + case E1000_TXDCTL(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TXDCTL = value & ~0xC0C0C0; + break; + case E1000_TADV: + sc->esc_TADV = value & ~0xFFFF0000; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + e82545_write_ra(sc, ridx, value); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; + break; + case E1000_EECD: + { + //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value); + /* edge triggered low->high */ + uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? + 0 : (value & E1000_EECD_SK)); + uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| + E1000_EECD_DI|E1000_EECD_REQ); + sc->eeprom_control &= ~eecd_mask; + sc->eeprom_control |= (value & eecd_mask); + /* grant/revoke immediately */ + if (value & E1000_EECD_REQ) { + sc->eeprom_control |= E1000_EECD_GNT; + } else { + sc->eeprom_control &= ~E1000_EECD_GNT; + } + if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { + e82545_eecd_strobe(sc); + } + return; + } + case E1000_MDIC: + { + uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> + E1000_MDIC_REG_SHIFT); + uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> + E1000_MDIC_PHY_SHIFT); + sc->mdi_control = + (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); + if ((value & E1000_MDIC_READY) != 0) { + DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value); + return; + } + switch (value & E82545_MDIC_OP_MASK) { + case E1000_MDIC_OP_READ: + sc->mdi_control &= ~E82545_MDIC_DATA_MASK; + sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); + break; + case E1000_MDIC_OP_WRITE: + e82545_write_mdi(sc, reg_addr, phy_addr, + value & E82545_MDIC_DATA_MASK); + break; + default: + DPRINTF("Unknown MDIC op: 0x%x\r\n", value); + return; + } + /* TODO: barrier? */ + sc->mdi_control |= E1000_MDIC_READY; + if (value & E82545_MDIC_IE) { + // TODO: generate interrupt + } + return; + } + case E1000_MANC: + case E1000_STATUS: + return; + default: + DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value); + return; + } +} + +static uint32_t +e82545_read_register(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register read offset:0x%x\r\n", offset); + return 0; + } + + DPRINTF("Register read: 0x%x\r\n", offset); + + switch (offset) { + case E1000_CTRL: + retval = sc->esc_CTRL; + break; + case E1000_STATUS: + retval = E1000_STATUS_FD | E1000_STATUS_LU | + E1000_STATUS_SPEED_1000; + break; + case E1000_FCAL: + retval = sc->esc_FCAL; + break; + case E1000_FCAH: + retval = sc->esc_FCAH; + break; + case E1000_FCT: + retval = sc->esc_FCT; + break; + case E1000_VET: + retval = sc->esc_VET; + break; + case E1000_FCTTV: + retval = sc->esc_FCTTV; + break; + case E1000_LEDCTL: + retval = sc->esc_LEDCTL; + break; + case E1000_PBA: + retval = sc->esc_PBA; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + retval = e82545_intr_read(sc, offset); + break; + case E1000_RCTL: + retval = sc->esc_RCTL; + break; + case E1000_FCRTL: + retval = sc->esc_FCRTL; + break; + case E1000_FCRTH: + retval = sc->esc_FCRTH; + break; + case E1000_RDBAL(0): + retval = sc->esc_RDBAL; + break; + case E1000_RDBAH(0): + retval = sc->esc_RDBAH; + break; + case E1000_RDLEN(0): + retval = sc->esc_RDLEN; + break; + case E1000_RDH(0): + retval = sc->esc_RDH; + break; + case E1000_RDT(0): + retval = sc->esc_RDT; + break; + case E1000_RDTR: + retval = sc->esc_RDTR; + break; + case E1000_RXDCTL(0): + retval = sc->esc_RXDCTL; + break; + case E1000_RADV: + retval = sc->esc_RADV; + break; + case E1000_RSRPD: + retval = sc->esc_RSRPD; + break; + case E1000_RXCSUM: + retval = sc->esc_RXCSUM; + break; + case E1000_TXCW: + retval = sc->esc_TXCW; + break; + case E1000_TCTL: + retval = sc->esc_TCTL; + break; + case E1000_TIPG: + retval = sc->esc_TIPG; + break; + case E1000_AIT: + retval = sc->esc_AIT; + break; + case E1000_TDBAL(0): + retval = sc->esc_TDBAL; + break; + case E1000_TDBAH(0): + retval = sc->esc_TDBAH; + break; + case E1000_TDLEN(0): + retval = sc->esc_TDLEN; + break; + case E1000_TDH(0): + retval = sc->esc_TDH; + break; + case E1000_TDT(0): + retval = sc->esc_TDT; + break; + case E1000_TIDV: + retval = sc->esc_TIDV; + break; + case E1000_TXDCTL(0): + retval = sc->esc_TXDCTL; + break; + case E1000_TADV: + retval = sc->esc_TADV; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + retval = e82545_read_ra(sc, ridx); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; + break; + case E1000_EECD: + //DPRINTF("EECD read %x\r\n", sc->eeprom_control); + retval = sc->eeprom_control; + break; + case E1000_MDIC: + retval = sc->mdi_control; + break; + case E1000_MANC: + retval = 0; + break; + /* stats that we emulate. */ + case E1000_MPC: + retval = sc->missed_pkt_count; + break; + case E1000_PRC64: + retval = sc->pkt_rx_by_size[0]; + break; + case E1000_PRC127: + retval = sc->pkt_rx_by_size[1]; + break; + case E1000_PRC255: + retval = sc->pkt_rx_by_size[2]; + break; + case E1000_PRC511: + retval = sc->pkt_rx_by_size[3]; + break; + case E1000_PRC1023: + retval = sc->pkt_rx_by_size[4]; + break; + case E1000_PRC1522: + retval = sc->pkt_rx_by_size[5]; + break; + case E1000_GPRC: + retval = sc->good_pkt_rx_count; + break; + case E1000_BPRC: + retval = sc->bcast_pkt_rx_count; + break; + case E1000_MPRC: + retval = sc->mcast_pkt_rx_count; + break; + case E1000_GPTC: + case E1000_TPT: + retval = sc->good_pkt_tx_count; + break; + case E1000_GORCL: + retval = (uint32_t)sc->good_octets_rx; + break; + case E1000_GORCH: + retval = (uint32_t)(sc->good_octets_rx >> 32); + break; + case E1000_TOTL: + case E1000_GOTCL: + retval = (uint32_t)sc->good_octets_tx; + break; + case E1000_TOTH: + case E1000_GOTCH: + retval = (uint32_t)(sc->good_octets_tx >> 32); + break; + case E1000_ROC: + retval = sc->oversize_rx_count; + break; + case E1000_TORL: + retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); + break; + case E1000_TORH: + retval = (uint32_t)((sc->good_octets_rx + + sc->missed_octets) >> 32); + break; + case E1000_TPR: + retval = sc->good_pkt_rx_count + sc->missed_pkt_count + + sc->oversize_rx_count; + break; + case E1000_PTC64: + retval = sc->pkt_tx_by_size[0]; + break; + case E1000_PTC127: + retval = sc->pkt_tx_by_size[1]; + break; + case E1000_PTC255: + retval = sc->pkt_tx_by_size[2]; + break; + case E1000_PTC511: + retval = sc->pkt_tx_by_size[3]; + break; + case E1000_PTC1023: + retval = sc->pkt_tx_by_size[4]; + break; + case E1000_PTC1522: + retval = sc->pkt_tx_by_size[5]; + break; + case E1000_MPTC: + retval = sc->mcast_pkt_tx_count; + break; + case E1000_BPTC: + retval = sc->bcast_pkt_tx_count; + break; + case E1000_TSCTC: + retval = sc->tso_tx_count; + break; + /* stats that are always 0. */ + case E1000_CRCERRS: + case E1000_ALGNERRC: + case E1000_SYMERRS: + case E1000_RXERRC: + case E1000_SCC: + case E1000_ECOL: + case E1000_MCC: + case E1000_LATECOL: + case E1000_COLC: + case E1000_DC: + case E1000_TNCRS: + case E1000_SEC: + case E1000_CEXTERR: + case E1000_RLEC: + case E1000_XONRXC: + case E1000_XONTXC: + case E1000_XOFFRXC: + case E1000_XOFFTXC: + case E1000_FCRUC: + case E1000_RNBC: + case E1000_RUC: + case E1000_RFC: + case E1000_RJC: + case E1000_MGTPRC: + case E1000_MGTPDC: + case E1000_MGTPTC: + case E1000_TSCTFC: + retval = 0; + break; + default: + DPRINTF("Unknown read register: 0x%x\r\n", offset); + retval = 0; + break; + } + + return (retval); +} + +static void +e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct e82545_softc *sc; + + //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size); + + sc = pi->pi_arg; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value); + } else + sc->io_addr = (uint32_t)value; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value); + } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value); + } else + e82545_write_register(sc, sc->io_addr, + (uint32_t)value); + break; + default: + DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value); + } else + e82545_write_register(sc, (uint32_t)offset, + (uint32_t)value); + break; + default: + DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n", + baridx, offset, value, size); + } + + pthread_mutex_unlock(&sc->esc_mtx); +} + +static uint64_t +e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct e82545_softc *sc; + uint64_t retval; + + //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); + sc = pi->pi_arg; + retval = 0; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr read sz:%d\r\n", size); + } else + retval = sc->io_addr; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data read sz:%d\r\n", size); + } + if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io read addr:0x%x\r\n", + sc->io_addr); + } else + retval = e82545_read_register(sc, sc->io_addr); + break; + default: + DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n", + offset, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register read size:%d offset:0x%lx\r\n", + size, offset); + } else + retval = e82545_read_register(sc, (uint32_t)offset); + break; + default: + DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n", + baridx, offset, size); + break; + } + + pthread_mutex_unlock(&sc->esc_mtx); + + return (retval); +} + +static void +e82545_reset(struct e82545_softc *sc, int drvr) +{ + int i; + + e82545_rx_disable(sc); + e82545_tx_disable(sc); + + /* clear outstanding interrupts */ + if (sc->esc_irq_asserted) + pci_lintr_deassert(sc->esc_pi); + + /* misc */ + if (!drvr) { + sc->esc_FCAL = 0; + sc->esc_FCAH = 0; + sc->esc_FCT = 0; + sc->esc_VET = 0; + sc->esc_FCTTV = 0; + } + sc->esc_LEDCTL = 0x07061302; + sc->esc_PBA = 0x00100030; + + /* start nvm in opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; + e82545_init_eeprom(sc); + + /* interrupt */ + sc->esc_ICR = 0; + sc->esc_ITR = 250; + sc->esc_ICS = 0; + sc->esc_IMS = 0; + sc->esc_IMC = 0; + + /* L2 filters */ + if (!drvr) { + memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); + memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); + memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); + + /* XXX not necessary on 82545 ?? */ + sc->esc_uni[0].eu_valid = 1; + memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, + ETHER_ADDR_LEN); + } else { + /* Clear RAH valid bits */ + for (i = 0; i < 16; i++) + sc->esc_uni[i].eu_valid = 0; + } + + /* receive */ + if (!drvr) { + sc->esc_RDBAL = 0; + sc->esc_RDBAH = 0; + } + sc->esc_RCTL = 0; + sc->esc_FCRTL = 0; + sc->esc_FCRTH = 0; + sc->esc_RDLEN = 0; + sc->esc_RDH = 0; + sc->esc_RDT = 0; + sc->esc_RDTR = 0; + sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ + sc->esc_RADV = 0; + sc->esc_RXCSUM = 0; + + /* transmit */ + if (!drvr) { + sc->esc_TDBAL = 0; + sc->esc_TDBAH = 0; + sc->esc_TIPG = 0; + sc->esc_AIT = 0; + sc->esc_TIDV = 0; + sc->esc_TADV = 0; + } + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + sc->esc_TXCW = 0; + sc->esc_TCTL = 0; + sc->esc_TDLEN = 0; + sc->esc_TDT = 0; + sc->esc_TDHr = sc->esc_TDH = 0; + sc->esc_TXDCTL = 0; +} + +static void +e82545_open_tap(struct e82545_softc *sc, char *opts) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (opts == NULL) { + sc->esc_tapfd = -1; + return; + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, opts, sizeof(tbuf)); + + sc->esc_tapfd = open(tbuf, O_RDWR); + if (sc->esc_tapfd == -1) { + DPRINTF("unable to open tap device %s\n", opts); + exit(4); + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF("tap device O_NONBLOCK failed: %d\n", errno); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->esc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + +#ifdef __FreeBSD__ + sc->esc_mevp = mevent_add(sc->esc_tapfd, + EVF_READ, + e82545_tap_callback, + sc); + if (sc->esc_mevp == NULL) { + DPRINTF("Could not register mevent %d\n", EVF_READ); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } +#endif +} + +static int +e82545_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (1); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + return (0); +} + +static int +e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + DPRINTF("Loading with options: %s\r\n", opts); + + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct e82545_softc *sc; + char *devname; + char *vtopts; + int mac_provided; + + /* Setup our softc */ + sc = calloc(1, sizeof(*sc)); + + pi->pi_arg = sc; + sc->esc_pi = pi; + sc->esc_ctx = ctx; + + pthread_mutex_init(&sc->esc_mtx, NULL); + pthread_cond_init(&sc->esc_rx_cond, NULL); + pthread_cond_init(&sc->esc_tx_cond, NULL); + pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); + snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->esc_tx_tid, nstr); + + pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); + pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); + + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); + + /* TODO: this card also supports msi, but the freebsd driver for it + * does not, so I have not implemented it. */ + pci_lintr_request(pi); + + pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, + E82545_BAR_REGISTER_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, + E82545_BAR_FLASH_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, + E82545_BAR_IO_LEN); + + /* + * Attempt to open the tap device and read the MAC address + * if specified. Copied from virtio-net, slightly modified. + */ + mac_provided = 0; + sc->esc_tapfd = -1; + if (opts != NULL) { + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = e82545_parsemac(vtopts, sc->esc_mac.octet); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } + + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + e82545_open_tap(sc, devname); + + free(devname); + } + + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->esc_mac.octet[0] = 0x00; + sc->esc_mac.octet[1] = 0xa0; + sc->esc_mac.octet[2] = 0x98; + sc->esc_mac.octet[3] = digest[0]; + sc->esc_mac.octet[4] = digest[1]; + sc->esc_mac.octet[5] = digest[2]; + } + + /* H/w initiated reset */ + e82545_reset(sc, 0); + + return (0); +} + +struct pci_devemu pci_de_e82545 = { + .pe_emu = "e1000", + .pe_init = e82545_init, + .pe_barwrite = e82545_write, + .pe_barread = e82545_read +}; +PCI_EMUL_SET(pci_de_e82545); + diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c new file mode 100644 index 0000000000..03db632e37 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -0,0 +1,2141 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> + +#include <ctype.h> +#include <errno.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <assert.h> +#include <stdbool.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "bhyverun.h" +#include "inout.h" +#include "ioapic.h" +#include "mem.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CONF1_ENABLE 0x80000000ul + +#define MAXBUSES (PCI_BUSMAX + 1) +#define MAXSLOTS (PCI_SLOTMAX + 1) +#define MAXFUNCS (PCI_FUNCMAX + 1) + +struct funcinfo { + char *fi_name; + char *fi_param; + struct pci_devinst *fi_devi; +}; + +struct intxinfo { + int ii_count; + int ii_pirq_pin; + int ii_ioapic_irq; +}; + +struct slotinfo { + struct intxinfo si_intpins[4]; + struct funcinfo si_funcs[MAXFUNCS]; +}; + +struct businfo { + uint16_t iobase, iolimit; /* I/O window */ + uint32_t membase32, memlimit32; /* mmio window below 4GB */ + uint64_t membase64, memlimit64; /* mmio window above 4GB */ + struct slotinfo slotinfo[MAXSLOTS]; +}; + +static struct businfo *pci_businfo[MAXBUSES]; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static struct pci_devemu *pci_emul_finddev(char *name); +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); + +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * <bus>:<slot>:<func>,<emul>[,<config>] + * <slot>[:<func>],<emul>[,<config>] + * + * slot is 0..31 + * func is 0..7 + * emul is a string describing the type of PCI device e.g. virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3:0,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + + fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); +} + +int +pci_parse_slot(char *opt) +{ + struct businfo *bi; + struct slotinfo *si; + char *emul, *config, *str, *cp; + int error, bnum, snum, fnum; + + error = -1; + str = strdup(opt); + + emul = config = NULL; + if ((cp = strchr(str, ',')) != NULL) { + *cp = '\0'; + emul = cp + 1; + if ((cp = strchr(emul, ',')) != NULL) { + *cp = '\0'; + config = cp + 1; + } + } else { + pci_parse_slot_usage(opt); + goto done; + } + + /* <bus>:<slot>:<func> */ + if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { + bnum = 0; + /* <slot>:<func> */ + if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { + fnum = 0; + /* <slot> */ + if (sscanf(str, "%d", &snum) != 1) { + snum = -1; + } + } + } + + if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || + fnum < 0 || fnum >= MAXFUNCS) { + pci_parse_slot_usage(opt); + goto done; + } + + if (pci_businfo[bnum] == NULL) + pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); + + bi = pci_businfo[bnum]; + si = &bi->slotinfo[snum]; + + if (si->si_funcs[fnum].fi_name != NULL) { + fprintf(stderr, "pci slot %d:%d already occupied!\n", + snum, fnum); + goto done; + } + + if (pci_emul_finddev(emul) == NULL) { + fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", + snum, fnum, emul); + goto done; + } + + error = 0; + si->si_funcs[fnum].fi_name = emul; + si->si_funcs[fnum].fi_param = config; + +done: + if (error) + free(str); + + return (error); +} + +void +pci_print_supported_devices() +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + printf("%s\n", pdp->pe_emu); + } +} + +static int +pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) +{ + + if (offset < pi->pi_msix.pba_offset) + return (0); + + if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + return (0); + } + + return (1); +} + +int +pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value) +{ + int msix_entry_offset; + int tab_index; + char *dest; + + /* support only 4 or 8 byte writes */ + if (size != 4 && size != 8) + return (-1); + + /* + * Return if table index is beyond what device supports + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + if (tab_index >= pi->pi_msix.table_count) + return (-1); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned writes */ + if ((msix_entry_offset % size) != 0) + return (-1); + + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + *((uint32_t *)dest) = value; + else + *((uint64_t *)dest) = value; + + return (0); +} + +uint64_t +pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) +{ + char *dest; + int msix_entry_offset; + int tab_index; + uint64_t retval = ~0; + + /* + * The PCI standard only allows 4 and 8 byte accesses to the MSI-X + * table but we also allow 1 byte access to accommodate reads from + * ddb. + */ + if (size != 1 && size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 1) + retval = *((uint8_t *)dest); + else if (size == 4) + retval = *((uint32_t *)dest); + else + retval = *((uint64_t *)dest); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +int +pci_msix_table_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.table_bar); + else + return (-1); +} + +int +pci_msix_pba_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.pba_bar); + else + return (-1); +} + +static int +pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, + offset, bytes); + else + (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, + bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || + pdi->pi_bar[bidx].type == PCIBAR_MEM64); + assert(addr >= pdi->pi_bar[bidx].addr && + addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + 4, *val & 0xffffffff); + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, + 4, *val >> 32); + } else { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, 4); + *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset + 4, 4) << 32; + } else { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, size); + } + } + + return (0); +} + + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +/* + * Register (or unregister) the MMIO or I/O region associated with the BAR + * register 'idx' of an emulated pci device. + */ +static void +modify_bar_registration(struct pci_devinst *pi, int idx, int registration) +{ + int error; + struct inout_port iop; + struct mem_range mr; + + switch (pi->pi_bar[idx].type) { + case PCIBAR_IO: + bzero(&iop, sizeof(struct inout_port)); + iop.name = pi->pi_name; + iop.port = pi->pi_bar[idx].addr; + iop.size = pi->pi_bar[idx].size; + if (registration) { + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_io_handler; + iop.arg = pi; + error = register_inout(&iop); + } else + error = unregister_inout(&iop); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + bzero(&mr, sizeof(struct mem_range)); + mr.name = pi->pi_name; + mr.base = pi->pi_bar[idx].addr; + mr.size = pi->pi_bar[idx].size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = pci_emul_mem_handler; + mr.arg1 = pi; + mr.arg2 = idx; + error = register_mem(&mr); + } else + error = unregister_mem(&mr); + break; + default: + error = EINVAL; + break; + } + assert(error == 0); +} + +static void +unregister_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 0); +} + +static void +register_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 1); +} + +/* Are we decoding i/o port accesses for the emulated pci device? */ +static int +porten(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_PORTEN); +} + +/* Are we decoding memory accesses for the emulated pci device? */ +static int +memen(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_MEMEN); +} + +/* + * Update the MMIO or I/O address that is decoded by the BAR register. + * + * If the pci device has enabled the address space decoding then intercept + * the address range decoded by the BAR register. + */ +static void +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +{ + int decode; + + if (pi->pi_bar[idx].type == PCIBAR_IO) + decode = porten(pi); + else + decode = memen(pi); + + if (decode) + unregister_bar(pi, idx); + + switch (type) { + case PCIBAR_IO: + case PCIBAR_MEM32: + pi->pi_bar[idx].addr = addr; + break; + case PCIBAR_MEM64: + pi->pi_bar[idx].addr &= ~0xffffffffUL; + pi->pi_bar[idx].addr |= addr; + break; + case PCIBAR_MEMHI64: + pi->pi_bar[idx].addr &= 0xffffffff; + pi->pi_bar[idx].addr |= addr; + break; + default: + assert(0); + } + + if (decode) + register_bar(pi, idx); +} + +int +pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + uint64_t *baseptr = NULL; + uint64_t limit = 0, lobits = 0; + uint64_t addr, mask, bar; + int error; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + /* Enforce minimum BAR sizes required by the PCI standard */ + if (type == PCIBAR_IO) { + if (size < 4) + size = 4; + } else { + if (size < 16) + size = 16; + } + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + /* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless then allocation size is larger than some arbitrary + * number (32MB currently). + */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } else { + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + } + break; + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); +#ifdef FreeBSD + assert(0); +#else + abort(); +#endif + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + register_bar(pdi, idx); + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, reallen; + uint16_t sts; + + assert(caplen > 0); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) + capoff = CAP_START_OFFSET; + else + capoff = pi->pi_capend + 1; + + /* Check if we have enough space */ + if (capoff + reallen > PCI_REGMAX + 1) + return (-1); + + /* Set the previous capability pointer */ + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else + pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, 0); + + pi->pi_prevcap = capoff; + pi->pi_capend = capoff + reallen - 1; + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, + int func, struct funcinfo *fi) +{ + struct pci_devinst *pdi; + int err; + + pdi = calloc(1, sizeof(struct pci_devinst)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = bus; + pdi->pi_slot = slot; + pdi->pi_func = func; + pthread_mutex_init(&pdi->pi_lintr.lock, NULL); + pdi->pi_lintr.pin = 0; + pdi->pi_lintr.state = IDLE; + pdi->pi_lintr.pirq_pin = 0; + pdi->pi_lintr.ioapic_irq = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + err = (*pde->pe_init)(ctx, pdi, fi->fi_param); + if (err == 0) + fi->fi_devi = pdi; + else + free(pdi); + + return (err); +} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size) +{ + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. + * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = msgnum - 1; + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + pi->pi_msix.pba_size = PBA_SIZE(msgnum); + + pci_msix_table_init(pi, msgnum); + + pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + tab_size + pi->pi_msix.pba_size); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + +void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask; + int off; + + off = offset - capoff; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.addr = addrlo; + pi->pi_msi.msg_data = msgdata; + pi->pi_msi.maxmsgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.maxmsgnum = 0; + } + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + + /* XXX don't write to the readonly parts */ + CFGWRITE(pi, offset, val, bytes); +} + +#define PCIECAP_VERSION 0x2 +int +pci_emul_add_pciecap(struct pci_devinst *pi, int type) +{ + int err; + struct pciecap pciecap; + + if (type != PCIEM_TYPE_ROOT_PORT) + return (-1); + + bzero(&pciecap, sizeof(pciecap)); + + pciecap.capid = PCIY_EXPRESS; + pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + + err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); + return (err); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly. + * However, some o/s's do 4-byte writes that include these. + * For this case, trim the write back to 2 bytes and adjust + * the data. + */ + if (offset == capoff || offset == capoff + 1) { + if (offset == capoff && bytes == 4) { + bytes = 2; + offset += 2; + val >>= 16; + } else + return; + } + + capid = pci_get_cfgdata8(pi, capoff); + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_EXPRESS: + pciecap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + uint16_t sts; + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) + return (1); + } + return (0); +} + +static int +pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + /* + * Ignore writes; return 0xff's for reads. The mem read code + * will take care of truncating to the correct size. + */ + if (dir == MEM_F_READ) { + *val = 0xffffffffffffffff; + } + + return (0); +} + +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + +#define BUSIO_ROUNDUP 32 +#define BUSMEM_ROUNDUP (1024 * 1024) + +int +init_pci(struct vmctx *ctx) +{ + struct mem_range mr; + struct pci_devemu *pde; + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + size_t lowmem; + int bus, slot, func; + int error; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = vm_get_lowmem_limit(ctx); + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + /* + * Keep track of the i/o and memory resources allocated to + * this bus. + */ + bi->iobase = pci_emul_iobase; + bi->membase32 = pci_emul_membase32; + bi->membase64 = pci_emul_membase64; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + pde = pci_emul_finddev(fi->fi_name); + assert(pde != NULL); + error = pci_emul_init(ctx, pde, bus, slot, + func, fi); + if (error) + return (error); + } + } + + /* + * Add some slop to the I/O and memory resources decoded by + * this bus to give a guest some flexibility if it wants to + * reprogram the BARs. + */ + pci_emul_iobase += BUSIO_ROUNDUP; + pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); + bi->iolimit = pci_emul_iobase; + + pci_emul_membase32 += BUSMEM_ROUNDUP; + pci_emul_membase32 = roundup2(pci_emul_membase32, + BUSMEM_ROUNDUP); + bi->memlimit32 = pci_emul_membase32; + + pci_emul_membase64 += BUSMEM_ROUNDUP; + pci_emul_membase64 = roundup2(pci_emul_membase64, + BUSMEM_ROUNDUP); + bi->memlimit64 = pci_emul_membase64; + } + + /* + * PCI backends are initialized before routing INTx interrupts + * so that LPC devices are able to reserve ISA IRQs before + * routing PIRQ pins. + */ + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_devi == NULL) + continue; + pci_lintr_route(fi->fi_devi); + } + } + } + lpc_pirq_routed(); + + /* + * The guest physical memory map looks like the following: + * [0, lowmem) guest system memory + * [lowmem, lowmem_limit) memory hole (may be absent) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware + * [4GB, 4GB + highmem) + */ + + /* + * Accesses to memory addresses that are not allocated to system + * memory or PCI devices return 0xff's. + */ + lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); + + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); + assert(error == 0); + + return (0); +} + +static void +pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" Zero,"); + dsdt_line(" 0x%X", ioapic_irq); + dsdt_line(" },"); +} + +static void +pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + char *name; + + name = lpc_pirq_name(pirq_pin); + if (name == NULL) + return; + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" %s,", name); + dsdt_line(" 0x00"); + dsdt_line(" },"); + free(name); +} + +/* + * A bhyve virtual machine has a flat PCI hierarchy with a root port + * corresponding to each PCI bus. + */ +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + int count, func, slot; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. + */ + if (bus != 0) + return; + } + + dsdt_line(" Device (PC%02X)", bus); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); + dsdt_line(" Name (_ADR, Zero)"); + + dsdt_line(" Method (_BBN, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Return (0x%08X)", bus); + dsdt_line(" }"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " + "MaxFixed, PosDecode,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bus); + dsdt_line(" 0x%04X, // Range Maximum", bus); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0001, // Length"); + dsdt_line(" ,, )"); + + if (bus == 0) { + dsdt_indent(3); + dsdt_fixed_ioport(0xCF8, 8); + dsdt_unindent(3); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0000, // Range Minimum"); + dsdt_line(" 0x0CF7, // Range Maximum"); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0CF8, // Length"); + dsdt_line(" ,, , TypeStatic)"); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0D00, // Range Minimum"); + dsdt_line(" 0x%04X, // Range Maximum", + PCI_EMUL_IOBASE - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + PCI_EMUL_IOBASE - 0x0D00); + dsdt_line(" ,, , TypeStatic)"); + + if (bi == NULL) { + dsdt_line(" })"); + goto done; + } + } + assert(bi != NULL); + + /* i/o window */ + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); + dsdt_line(" 0x%04X, // Range Maximum", + bi->iolimit - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + bi->iolimit - bi->iobase); + dsdt_line(" ,, , TypeStatic)"); + + /* mmio window (32-bit) */ + dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x00000000, // Granularity"); + dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); + dsdt_line(" 0x%08X, // Range Maximum\n", + bi->memlimit32 - 1); + dsdt_line(" 0x00000000, // Translation Offset"); + dsdt_line(" 0x%08X, // Length\n", + bi->memlimit32 - bi->membase32); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + + /* mmio window (64-bit) */ + dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x0000000000000000, // Granularity"); + dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); + dsdt_line(" 0x%016lX, // Range Maximum\n", + bi->memlimit64 - 1); + dsdt_line(" 0x0000000000000000, // Translation Offset"); + dsdt_line(" 0x%016lX, // Length\n", + bi->memlimit64 - bi->membase64); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + dsdt_line(" })"); + + count = pci_count_lintr(bus); + if (count != 0) { + dsdt_indent(2); + dsdt_line("Name (PPRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Name (APRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_apic_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Method (_PRT, 0, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (PICM)"); + dsdt_line(" {"); + dsdt_line(" Return (APRT)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (PPRT)"); + dsdt_line(" }"); + dsdt_line("}"); + dsdt_unindent(2); + } + + dsdt_indent(2); + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + pi = si->si_funcs[func].fi_devi; + if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) + pi->pi_d->pe_write_dsdt(pi); + } + } + dsdt_unindent(2); +done: + dsdt_line(" }"); +} + +void +pci_write_dsdt(void) +{ + int bus; + + dsdt_indent(1); + dsdt_line("Name (PICM, 0x00)"); + dsdt_line("Method (_PIC, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" Store (Arg0, PICM)"); + dsdt_line("}"); + dsdt_line(""); + dsdt_line("Scope (_SB)"); + dsdt_line("{"); + for (bus = 0; bus < MAXBUSES; bus++) + pci_bus_write_dsdt(bus); + dsdt_line("}"); + dsdt_unindent(1); +} + +int +pci_bus_configured(int bus) +{ + assert(bus >= 0 && bus < MAXBUSES); + return (pci_businfo[bus] != NULL); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_maxmsgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.maxmsgnum); + else + return (0); +} + +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); + } +} + +void +pci_generate_msi(struct pci_devinst *pi, int index) +{ + + if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { + vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, + pi->pi_msi.msg_data + index); + } +} + +static bool +pci_lintr_permitted(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || + (cmd & PCIM_CMD_INTxDIS))); +} + +void +pci_lintr_request(struct pci_devinst *pi) +{ + struct businfo *bi; + struct slotinfo *si; + int bestpin, bestcount, pin; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + + /* + * Just allocate a pin from our slot. The pin will be + * assigned IRQs later when interrupts are routed. + */ + si = &bi->slotinfo[pi->pi_slot]; + bestpin = 0; + bestcount = si->si_intpins[0].ii_count; + for (pin = 1; pin < 4; pin++) { + if (si->si_intpins[pin].ii_count < bestcount) { + bestpin = pin; + bestcount = si->si_intpins[pin].ii_count; + } + } + + si->si_intpins[bestpin].ii_count++; + pi->pi_lintr.pin = bestpin + 1; + pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); +} + +static void +pci_lintr_route(struct pci_devinst *pi) +{ + struct businfo *bi; + struct intxinfo *ii; + + if (pi->pi_lintr.pin == 0) + return; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; + + /* + * Attempt to allocate an I/O APIC pin for this intpin if one + * is not yet assigned. + */ + if (ii->ii_ioapic_irq == 0) + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); + assert(ii->ii_ioapic_irq > 0); + + /* + * Attempt to allocate a PIRQ pin for this intpin if one is + * not yet assigned. + */ + if (ii->ii_pirq_pin == 0) + ii->ii_pirq_pin = pirq_alloc_pin(pi); + assert(ii->ii_pirq_pin > 0); + + pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; + pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; + pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); +} + +void +pci_lintr_assert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == IDLE) { + if (pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } else + pi->pi_lintr.state = PENDING; + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +void +pci_lintr_deassert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED) { + pi->pi_lintr.state = IDLE; + pci_irq_deassert(pi); + } else if (pi->pi_lintr.state == PENDING) + pi->pi_lintr.state = IDLE; + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +static void +pci_lintr_update(struct pci_devinst *pi) +{ + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { + pci_irq_deassert(pi); + pi->pi_lintr.state = PENDING; + } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +#ifndef __FreeBSD__ + if (pi->pi_d->pe_lintrupdate != NULL) { + pi->pi_d->pe_lintrupdate(pi); + } +#endif /* __FreeBSD__ */ +} + +int +pci_count_lintr(int bus) +{ + int count, slot, pin; + struct slotinfo *slotinfo; + + count = 0; + if (pci_businfo[bus] != NULL) { + for (slot = 0; slot < MAXSLOTS; slot++) { + slotinfo = &pci_businfo[bus]->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + if (slotinfo->si_intpins[pin].ii_count != 0) + count++; + } + } + } + return (count); +} + +void +pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) +{ + struct businfo *bi; + struct slotinfo *si; + struct intxinfo *ii; + int slot, pin; + + if ((bi = pci_businfo[bus]) == NULL) + return; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + ii = &si->si_intpins[pin]; + if (ii->ii_count != 0) + cb(bus, slot, pin + 1, ii->ii_pirq_pin, + ii->ii_ioapic_irq, arg); + } + } +} + +/* + * Return 1 if the emulated device in 'slot' is a multi-function device. + * Return 0 otherwise. + */ +static int +pci_emul_is_mfdev(int bus, int slot) +{ + struct businfo *bi; + struct slotinfo *si; + int f, numfuncs; + + numfuncs = 0; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + for (f = 0; f < MAXFUNCS; f++) { + if (si->si_funcs[f].fi_devi != NULL) { + numfuncs++; + } + } + } + return (numfuncs > 1); +} + +/* + * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on + * whether or not is a multi-function being emulated in the pci 'slot'. + */ +static void +pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) +{ + int mfdev; + + if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { + mfdev = pci_emul_is_mfdev(bus, slot); + switch (bytes) { + case 1: + case 2: + *rv &= ~PCIM_MFDEV; + if (mfdev) { + *rv |= PCIM_MFDEV; + } + break; + case 4: + *rv &= ~(PCIM_MFDEV << 16); + if (mfdev) { + *rv |= (PCIM_MFDEV << 16); + } + break; + } + } +} + +static void +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) +{ + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + + /* + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. + */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ + + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; + + /* + * If the MMIO or I/O address space decoding has changed then + * register/unregister all BARs that decode that address space. + */ + for (i = 0; i <= PCI_BARMAX; i++) { + switch (pi->pi_bar[i].type) { + case PCIBAR_NONE: + case PCIBAR_MEMHI64: + break; + case PCIBAR_IO: + /* I/O address space decoding changed? */ + if (changed & PCIM_CMD_PORTEN) { + if (porten(pi)) + register_bar(pi, i); + else + unregister_bar(pi, i); + } + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + /* MMIO address space decoding changed? */ + if (changed & PCIM_CMD_MEMEN) { + if (memen(pi)) + register_bar(pi, i); + else + unregister_bar(pi, i); + } + break; + default: + assert(0); + } + } + + /* + * If INTx has been unmasked and is pending, assert the + * interrupt. + */ + pci_lintr_update(pi); +} + +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, + int coff, int bytes, uint32_t *eax) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + struct pci_devemu *pe; + int idx, needcfg; + uint64_t addr, mask; + uint64_t bar = 0; + + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + pi = si->si_funcs[func].fi_devi; + } else + pi = NULL; + + /* + * Just return if there is no device at this slot:func or if the + * the guest is doing an un-aligned access. + */ + if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || + (coff & (bytes - 1)) != 0) { + if (in) + *eax = 0xffffffff; + return; + } + + /* + * Ignore all writes beyond the standard config space and return all + * ones on reads. + */ + if (coff >= PCI_REGMAX + 1) { + if (in) { + *eax = 0xffffffff; + /* + * Extended capabilities begin at offset 256 in config + * space. Absence of extended capabilities is signaled + * with all 0s in the extended capability header at + * offset 256. + */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; + } + + pe = pi->pi_d; + + /* + * Config read + */ + if (in) { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgread != NULL) { + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); + } else { + needcfg = 1; + } + + if (needcfg) + *eax = CFGREAD(pi, coff, bytes); + + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); + } else { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgwrite != NULL && + (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) + return; + + /* + * Special handling for write to BAR registers + */ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + /* + * Ignore writes to BAR registers that are not + * 4-byte aligned. + */ + if (bytes != 4 || (coff & 0x3) != 0) + return; + idx = (coff - PCIR_BAR(0)) / 4; + mask = ~(pi->pi_bar[idx].size - 1); + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + pi->pi_bar[idx].addr = bar = 0; + break; + case PCIBAR_IO: + addr = *eax & mask; + addr &= 0xffff; + bar = addr | PCIM_BAR_IO_SPACE; + /* + * Register the new BAR value for interception + */ + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_IO); + } + break; + case PCIBAR_MEM32: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM32); + } + break; + case PCIBAR_MEM64: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + if (addr != (uint32_t)pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM64); + } + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + addr = ((uint64_t)*eax << 32) & mask; + bar = addr >> 32; + if (bar != pi->pi_bar[idx - 1].addr >> 32) { + update_bar_address(pi, addr, idx - 1, + PCIBAR_MEMHI64); + } + break; + default: + assert(0); + } + pci_set_cfgdata32(pi, coff, bar); + + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 0xffff : 0xff; + return (0); + } + + if (in) { + x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DIOSZ 8 +#define DMEMSZ 4096 +struct pci_emul_dsoftc { + uint8_t ioregs[DIOSZ]; + uint8_t memregs[2][DMEMSZ]; +}; + +#define PCI_EMUL_MSI_MSGS 4 +#define PCI_EMUL_MSIX_MSGS 16 + +static int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = calloc(1, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("diow: iow too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)&sc->ioregs[offset] = value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_maxmsgnum(pi); i++) + pci_generate_msi(pi, i); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + size > DMEMSZ) { + printf("diow: memw too large, offset %ld size %d\n", + offset, size); + return; + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + sc->memregs[i][offset] = value; + } else if (size == 2) { + *(uint16_t *)&sc->memregs[i][offset] = value; + } else if (size == 4) { + *(uint32_t *)&sc->memregs[i][offset] = value; + } else if (size == 8) { + *(uint64_t *)&sc->memregs[i][offset] = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? + */ + } + + if (baridx > 2 || baridx < 0) { + printf("diow: unknown bar idx %d\n", baridx); + } +} + +static uint64_t +pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + int i; + + value = 0; + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("dior: ior too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->ioregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->ioregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + size > DMEMSZ) { + printf("dior: memr too large, offset %ld size %d\n", + offset, size); + return (0); + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + value = sc->memregs[i][offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->memregs[i][offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->memregs[i][offset]; + } else if (size == 8) { + value = *(uint64_t *) &sc->memregs[i][offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 2 || baridx < 0) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h new file mode 100644 index 0000000000..0053caed99 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -0,0 +1,298 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/_pthreadtypes.h> + +#include <dev/pci/pcireg.h> + +#include <assert.h> + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ + +struct vmctx; +struct pci_devinst; +struct memory_region; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, + char *opts); + + /* ACPI DSDT enumeration */ + void (*pe_write_dsdt)(struct pci_devinst *); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* BAR read/write callbacks */ + void (*pe_barwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size); + +#ifndef __FreeBSD__ + void (*pe_lintrupdate)(struct pci_devinst *pi); +#endif /* __FreeBSD__ */ +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +} __packed; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. + */ +#define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +struct pci_devinst { + struct pci_devemu *pi_d; + struct vmctx *pi_vmctx; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + int pi_bar_getsize; + int pi_prevcap; + int pi_capend; + + struct { + int8_t pin; + enum lintr_stat state; + int pirq_pin; + int ioapic_irq; + pthread_mutex_t lock; + } pi_lintr; + + struct { + int enabled; + uint64_t addr; + uint64_t msg_data; + int maxmsgnum; + } pi_msi; + + struct { + int enabled; + int table_bar; + int pba_bar; + uint32_t table_offset; + int table_count; + uint32_t pba_offset; + int pba_size; + int function_mask; + struct msix_table_entry *table; /* allocated at runtime */ + void *pba_page; + int pba_page_offset; + } pi_msix; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; + +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; +static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); + +struct msixcap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t table_info; /* bar index and offset within it */ + uint32_t pba_info; /* bar index and offset within it */ +} __packed; +static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); + +struct pciecap { + uint8_t capid; + uint8_t nextptr; + uint16_t pcie_capabilities; + + uint32_t dev_capabilities; /* all devices */ + uint16_t dev_control; + uint16_t dev_status; + + uint32_t link_capabilities; /* devices with links */ + uint16_t link_control; + uint16_t link_status; + + uint32_t slot_capabilities; /* ports with slots */ + uint16_t slot_control; + uint16_t slot_status; + + uint16_t root_control; /* root ports */ + uint16_t root_capabilities; + uint32_t root_status; + + uint32_t dev_capabilities2; /* all devices */ + uint16_t dev_control2; + uint16_t dev_status2; + + uint32_t link_capabilities2; /* devices with links */ + uint16_t link_control2; + uint16_t link_status2; + + uint32_t slot_capabilities2; /* ports with slots */ + uint16_t slot_control2; + uint16_t slot_status2; +} __packed; +static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); + +typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, + int ioapic_irq, void *arg); + +int init_pci(struct vmctx *ctx); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, + enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); +void pci_lintr_assert(struct pci_devinst *pi); +void pci_lintr_deassert(struct pci_devinst *pi); +void pci_lintr_request(struct pci_devinst *pi); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); +int pci_msix_table_bar(struct pci_devinst *pi); +int pci_msix_pba_bar(struct pci_devinst *pi); +int pci_msi_maxmsgnum(struct pci_devinst *pi); +int pci_parse_slot(char *opt); +void pci_print_supported_devices(); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); +int pci_count_lintr(int bus); +void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); +void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); +int pci_bus_configured(int bus); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c new file mode 100644 index 0000000000..8d24dde9da --- /dev/null +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -0,0 +1,467 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <errno.h> +#include <unistd.h> + +#include "bhyvegc.h" +#include "bhyverun.h" +#include "console.h" +#include "inout.h" +#include "pci_emul.h" +#include "rfb.h" +#include "vga.h" + +/* + * bhyve Framebuffer device emulation. + * BAR0 points to the current mode information. + * BAR1 is the 32-bit framebuffer address. + * + * -s <b>,fbuf,wait,vga=on|io|off,rfb=<ip>:port,w=width,h=height + */ + +static int fbuf_debug = 1; +#define DEBUG_INFO 1 +#define DEBUG_VERBOSE 4 +#define DPRINTF(level, params) if (level <= fbuf_debug) printf params + + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +#define DMEMSZ 128 + +#define FB_SIZE (16*MB) + +#define COLS_MAX 1920 +#define ROWS_MAX 1200 + +#define COLS_DEFAULT 1024 +#define ROWS_DEFAULT 768 + +#define COLS_MIN 640 +#define ROWS_MIN 480 + +struct pci_fbuf_softc { + struct pci_devinst *fsc_pi; + struct { + uint32_t fbsize; + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t refreshrate; + uint8_t reserved[116]; + } __packed memregs; + + /* rfb server */ + char *rfb_host; + char *rfb_password; + int rfb_port; +#ifndef __FreeBSD__ + char *rfb_unix; +#endif + int rfb_wait; + int vga_enabled; + int vga_full; + + uint32_t fbaddr; + char *fb_base; + uint16_t gc_width; + uint16_t gc_height; + void *vgasc; + struct bhyvegc_image *gc_image; +}; + +static struct pci_fbuf_softc *fbuf_sc; + +#define PCI_FBUF_MSI_MSGS 4 + +static void +pci_fbuf_usage(char *opt) +{ + + fprintf(stderr, "Invalid fbuf emulation option \"%s\"\r\n", opt); + fprintf(stderr, "fbuf: {wait,}{vga=on|io|off,}rfb=<ip>:port" + "{,w=width}{,h=height}\r\n"); +} + +static void +pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + + assert(baridx == 0); + + sc = pi->pi_arg; + + DPRINTF(DEBUG_VERBOSE, + ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + if (offset + size > DMEMSZ) { + printf("fbuf: write too large, offset %ld size %d\n", + offset, size); + return; + } + + p = (uint8_t *)&sc->memregs + offset; + + switch (size) { + case 1: + *p = value; + break; + case 2: + *(uint16_t *)p = value; + break; + case 4: + *(uint32_t *)p = value; + break; + case 8: + *(uint64_t *)p = value; + break; + default: + printf("fbuf: write unknown size %d\n", size); + break; + } + + if (!sc->gc_image->vgamode && sc->memregs.width == 0 && + sc->memregs.height == 0) { + DPRINTF(DEBUG_INFO, ("switching to VGA mode\r\n")); + sc->gc_image->vgamode = 1; + sc->gc_width = 0; + sc->gc_height = 0; + } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && + sc->memregs.height != 0) { + DPRINTF(DEBUG_INFO, ("switching to VESA mode\r\n")); + sc->gc_image->vgamode = 0; + } +} + +uint64_t +pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + uint64_t value; + + assert(baridx == 0); + + sc = pi->pi_arg; + + + if (offset + size > DMEMSZ) { + printf("fbuf: read too large, offset %ld size %d\n", + offset, size); + return (0); + } + + p = (uint8_t *)&sc->memregs + offset; + value = 0; + switch (size) { + case 1: + value = *p; + break; + case 2: + value = *(uint16_t *)p; + break; + case 4: + value = *(uint32_t *)p; + break; + case 8: + value = *(uint64_t *)p; + break; + default: + printf("fbuf: read unknown size %d\n", size); + break; + } + + DPRINTF(DEBUG_VERBOSE, + ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + return (value); +} + +static int +pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) +{ + char *uopts, *xopts, *config; + char *tmpstr; + int ret; + + ret = 0; + uopts = strdup(opts); + for (xopts = strtok(uopts, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (strcmp(xopts, "wait") == 0) { + sc->rfb_wait = 1; + continue; + } + + if ((config = strchr(xopts, '=')) == NULL) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + + *config++ = '\0'; + + DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s\r\n", + xopts, config)); + + if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { + /* + * IPv4 -- host-ip:port + * IPv6 -- [host-ip%zone]:port + * XXX for now port is mandatory. + */ + tmpstr = strsep(&config, "]"); + if (config) { + if (tmpstr[0] == '[') + tmpstr++; + sc->rfb_host = tmpstr; + if (config[0] == ':') + config++; + else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + sc->rfb_port = atoi(config); + } else { + config = tmpstr; + tmpstr = strsep(&config, ":"); + if (!config) + sc->rfb_port = atoi(tmpstr); + else { + sc->rfb_port = atoi(config); + sc->rfb_host = tmpstr; + } + } +#ifndef __FreeBSD__ + } else if (!strcmp(xopts, "unix")) { + sc->rfb_unix = config; +#endif + } else if (!strcmp(xopts, "vga")) { + if (!strcmp(config, "off")) { + sc->vga_enabled = 0; + } else if (!strcmp(config, "io")) { + sc->vga_enabled = 1; + sc->vga_full = 0; + } else if (!strcmp(config, "on")) { + sc->vga_enabled = 1; + sc->vga_full = 1; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } else if (!strcmp(xopts, "w")) { + sc->memregs.width = atoi(config); + if (sc->memregs.width > COLS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.width == 0) + sc->memregs.width = 1920; + } else if (!strcmp(xopts, "h")) { + sc->memregs.height = atoi(config); + if (sc->memregs.height > ROWS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.height == 0) + sc->memregs.height = 1080; + } else if (!strcmp(xopts, "password")) { + sc->rfb_password = config; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } + +done: + return (ret); +} + + +extern void vga_render(struct bhyvegc *gc, void *arg); + +void +pci_fbuf_render(struct bhyvegc *gc, void *arg) +{ + struct pci_fbuf_softc *sc; + + sc = arg; + + if (sc->vga_full && sc->gc_image->vgamode) { + /* TODO: mode switching to vga and vesa should use the special + * EFI-bhyve protocol port. + */ + vga_render(gc, sc->vgasc); + return; + } + if (sc->gc_width != sc->memregs.width || + sc->gc_height != sc->memregs.height) { + bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); + sc->gc_width = sc->memregs.width; + sc->gc_height = sc->memregs.height; + } + + return; +} + +static int +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error, prot; + struct pci_fbuf_softc *sc; + + if (fbuf_sc != NULL) { + fprintf(stderr, "Only one frame buffer device is allowed.\n"); + return (-1); + } + + sc = calloc(1, sizeof(struct pci_fbuf_softc)); + + pi->pi_arg = sc; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); + assert(error == 0); + + sc->fbaddr = pi->pi_bar[1].addr; + sc->memregs.fbsize = FB_SIZE; + sc->memregs.width = COLS_DEFAULT; + sc->memregs.height = ROWS_DEFAULT; + sc->memregs.depth = 32; + + sc->vga_enabled = 1; + sc->vga_full = 0; + + sc->fsc_pi = pi; + + error = pci_fbuf_parse_opts(sc, opts); + if (error != 0) + goto done; + + /* XXX until VGA rendering is enabled */ + if (sc->vga_full != 0) { + fprintf(stderr, "pci_fbuf: VGA rendering not enabled"); + goto done; + } + + sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); + if (sc->fb_base == MAP_FAILED) { + error = -1; + goto done; + } + DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]\r\n", + sc->fb_base, FB_SIZE)); + + /* + * Map the framebuffer into the guest address space. + * XXX This may fail if the BAR is different than a prior + * run. In this case flag the error. This will be fixed + * when a change_memseg api is available. + */ + prot = PROT_READ | PROT_WRITE; + if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { + fprintf(stderr, "pci_fbuf: mapseg failed - try deleting VM and restarting\n"); + error = -1; + goto done; + } + + console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); + console_fb_register(pci_fbuf_render, sc); + + if (sc->vga_enabled) + sc->vgasc = vga_init(!sc->vga_full); + sc->gc_image = console_get_image(); + + fbuf_sc = sc; + + memset((void *)sc->fb_base, 0, FB_SIZE); + +#ifdef __FreeBSD__ + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); +#else + if (sc->rfb_unix != NULL) { + error = rfb_init_unix(sc->rfb_unix, sc->rfb_wait, + sc->rfb_password); + } else { + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, + sc->rfb_password); + } +#endif +done: + if (error) + free(sc); + + return (error); +} + +struct pci_devemu pci_fbuf = { + .pe_emu = "fbuf", + .pe_init = pci_fbuf_init, + .pe_barwrite = pci_fbuf_write, + .pe_barread = pci_fbuf_read +}; +PCI_EMUL_SET(pci_fbuf); diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c new file mode 100644 index 0000000000..b926c7817e --- /dev/null +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -0,0 +1,236 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +#ifndef __FreeBSD__ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <strings.h> +#endif +__FBSDID("$FreeBSD$"); + +#include "pci_emul.h" + +#ifdef __FreeBSD__ +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + (void) pci_hostbridge_init(ctx, pi, opts); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + return (0); +} +#else +static void +pci_hostbridge_setup(struct pci_devinst *pi, uint16_t vendor, uint16_t device) +{ + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, vendor); + pci_set_cfgdata16(pi, PCIR_DEVICE, device); + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); +} + + +static int +pci_hostbridge_parse_pci_val(const char *in, uint16_t *val) +{ + long num; + char *endp = NULL; + + errno = 0; + num = strtol(in, &endp, 0); + if (errno != 0 || endp == NULL || *endp != '\0') { + fprintf(stderr, "pci_hostbridge: invalid num '%s'", in); + return (-1); + } else if (num < 1 || num > UINT16_MAX) { + fprintf(stderr, "pci_hostbridge: 0x%04lx out of range", num); + return (-1); + } + *val = num; + return (0); +} + +static struct pci_hostbridge_model { + const char *phm_model; + uint16_t phm_vendor; + uint16_t phm_device; +} pci_hb_models[] = { + { "amd", 0x1022, 0x7432 }, /* AMD/made-up */ + { "netapp", 0x1275, 0x1275 }, /* NetApp/NetApp */ + { "i440fx", 0x8086, 0x1237 }, /* Intel/82441 */ + { "q35", 0x8086, 0x29b0 }, /* Intel/Q35 HB */ +}; + +#define NUM_HB_MODELS (sizeof (pci_hb_models) / sizeof (pci_hb_models[0])) + +static int +pci_hostbridge_parse_args(char *opts, uint16_t *vendorp, uint16_t *devicep) +{ + const char *model = NULL; + char *next; + uint16_t vendor = 0, device = 0; + int err = 0; + + for (; opts != NULL && *opts != '\0'; opts = next) { + char *val, *cp; + + if ((cp = strchr(opts, ',')) != NULL) { + *cp = '\0'; + next = cp + 1; + } else { + next = NULL; + } + + if ((cp = strchr(opts, '=')) == NULL) { + fprintf(stderr, + "pci_hostbridge: expected value for param" + " (%s=VAL)", opts); + err = -1; + continue; + } + + /* <param>=<value> handling */ + val = cp + 1; + *cp = '\0'; + if (strcmp(opts, "model") == 0) { + model = val; + } else if (strcmp(opts, "vendor") == 0) { + if (pci_hostbridge_parse_pci_val(val, &vendor) != 0) { + err = -1; + continue; + } + } else if (strcmp(opts, "device") == 0) { + if (pci_hostbridge_parse_pci_val(val, &device) != 0) { + err = -1; + continue; + } + } else { + fprintf(stderr, + "pci_hostbridge: unrecognized option '%s'", opts); + err = -1; + continue; + } + } + if (err != 0) { + return (err); + } + + if (model != NULL && (vendor != 0 || device != 0)) { + fprintf(stderr, "pci_hostbridge: cannot specify model " + "and vendor/device"); + return (-1); + } else if ((vendor != 0 && device == 0) || + (vendor == 0 && device != 0)) { + fprintf(stderr, "pci_hostbridge: must specify both vendor and" + "device for custom hostbridge"); + return (-1); + } + if (model != NULL) { + uint_t i; + + for (i = 0; i < NUM_HB_MODELS; i++) { + if (strcmp(model, pci_hb_models[i].phm_model) != 0) + continue; + + /* found a model match */ + *vendorp = pci_hb_models[i].phm_vendor; + *devicep = pci_hb_models[i].phm_device; + return (0); + } + fprintf(stderr, "pci_hostbridge: invalid model '%s'", model); + return (-1); + } + + /* custom hostbridge ID was specified */ + *vendorp = vendor; + *devicep = device; + return (0); +} + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + uint16_t vendor, device; + + if (opts == NULL) { + /* Fall back to NetApp default if no options are specified */ + vendor = 0x1275; + device = 0x1275; + } else if (pci_hostbridge_parse_args(opts, &vendor, &device) != 0) { + return (-1); + } + + pci_hostbridge_setup(pi, vendor, device); + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + pci_hostbridge_setup(pi, 0x1022, 0x7432); + return (0); +} + +#endif /* __FreeBSD__ */ + +struct pci_devemu pci_de_amd_hostbridge = { + .pe_emu = "amd_hostbridge", + .pe_init = pci_amd_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_amd_hostbridge); + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c new file mode 100644 index 0000000000..4ecb3eddb0 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -0,0 +1,354 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <machine/vmm.h> + +#include <assert.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +/* + * Implement an 8 pin PCI interrupt router compatible with the router + * present on Intel's ICH10 chip. + */ + +/* Fields in each PIRQ register. */ +#define PIRQ_DIS 0x80 +#define PIRQ_IRQ 0x0f + +/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */ +#define PERMITTED_IRQS 0xdef8 +#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0) + +/* IRQ count to disable an IRQ. */ +#define IRQ_DISABLED 0xff + +static struct pirq { + uint8_t reg; + int use_count; + int active_count; + pthread_mutex_t lock; +} pirqs[8]; + +static u_char irq_counts[16]; +static int pirq_cold = 1; + +/* + * Returns true if this pin is enabled with a valid IRQ. Setting the + * register to a reserved IRQ causes interrupts to not be asserted as + * if the pin was disabled. + */ +static bool +pirq_valid_irq(int reg) +{ + + if (reg & PIRQ_DIS) + return (false); + return (IRQ_PERMITTED(reg & PIRQ_IRQ)); +} + +uint8_t +pirq_read(int pin) +{ + + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg); +} + +void +pirq_write(struct vmctx *ctx, int pin, uint8_t val) +{ + struct pirq *pirq; + + assert(pin > 0 && pin <= nitems(pirqs)); + pirq = &pirqs[pin - 1]; + pthread_mutex_lock(&pirq->lock); + if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) { + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ); + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + } + pthread_mutex_unlock(&pirq->lock); +} + +void +pci_irq_reserve(int irq) +{ + + assert(irq >= 0 && irq < nitems(irq_counts)); + assert(pirq_cold); + assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); + irq_counts[irq] = IRQ_DISABLED; +} + +void +pci_irq_use(int irq) +{ + + assert(irq >= 0 && irq < nitems(irq_counts)); + assert(pirq_cold); + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; +} + +void +pci_irq_init(struct vmctx *ctx) +{ + int i; + + for (i = 0; i < nitems(pirqs); i++) { + pirqs[i].reg = PIRQ_DIS; + pirqs[i].use_count = 0; + pirqs[i].active_count = 0; + pthread_mutex_init(&pirqs[i].lock, NULL); + } + for (i = 0; i < nitems(irq_counts); i++) { + if (IRQ_PERMITTED(i)) + irq_counts[i] = 0; + else + irq_counts[i] = IRQ_DISABLED; + } +} + +void +pci_irq_assert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count++; + if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) { + vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +void +pci_irq_deassert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count--; + if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) { + vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +int +pirq_alloc_pin(struct pci_devinst *pi) +{ + struct vmctx *ctx = pi->pi_vmctx; + int best_count, best_irq, best_pin, irq, pin; + + pirq_cold = 0; + + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8; + } else { + /* Find the least-used PIRQ pin. */ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } + } + } + pirqs[best_pin].use_count++; + + /* Second, route this pin to an IRQ. */ + if (pirqs[best_pin].reg == PIRQ_DIS) { + best_irq = -1; + best_count = 0; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (irq_counts[irq] == IRQ_DISABLED) + continue; + if (best_irq == -1 || irq_counts[irq] < best_count) { + best_irq = irq; + best_count = irq_counts[irq]; + } + } + assert(best_irq >= 0); + irq_counts[best_irq]++; + pirqs[best_pin].reg = best_irq; + vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); + } + + return (best_pin + 1); +} + +int +pirq_irq(int pin) +{ + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg & PIRQ_IRQ); +} + +/* XXX: Generate $PIR table. */ + +static void +pirq_dsdt(void) +{ + char *irq_prs, *old; + int irq, pin; + + irq_prs = NULL; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (!IRQ_PERMITTED(irq)) + continue; + if (irq_prs == NULL) + asprintf(&irq_prs, "%d", irq); + else { + old = irq_prs; + asprintf(&irq_prs, "%s,%d", old, irq); + free(old); + } + } + + /* + * A helper method to validate a link register's value. This + * duplicates pirq_valid_irq(). + */ + dsdt_line(""); + dsdt_line("Method (PIRV, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ); + dsdt_line(" If (LLess (Local0, 0x03))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x08))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x0D))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" Return (0x01)"); + dsdt_line("}"); + + for (pin = 0; pin < nitems(pirqs); pin++) { + dsdt_line(""); + dsdt_line("Device (LNK%c)", 'A' + pin); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))"); + dsdt_line(" Name (_UID, 0x%02X)", pin + 1); + dsdt_line(" Method (_STA, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" If (PIRV (PIR%c))", 'A' + pin); + dsdt_line(" {"); + dsdt_line(" Return (0x0B)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (0x09)"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line(" Name (_PRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {%s}", irq_prs); + dsdt_line(" })"); + dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {}"); + dsdt_line(" })"); + dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)", + pin + 1, 'A' + pin); + dsdt_line(" Method (_CRS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin, + PIRQ_DIS | PIRQ_IRQ); + dsdt_line(" If (PIRV (Local0))"); + dsdt_line(" {"); + dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Store (0x00, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Return (CB%02X)", pin + 1); + dsdt_line(" }"); + dsdt_line(" Method (_DIS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Store (0x80, PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Method (_SRS, 1, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin); + dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin); + dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line("}"); + } + free(irq_prs); +} +LPC_DSDT(pirq_dsdt); diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h new file mode 100644 index 0000000000..1ae56efc8f --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -0,0 +1,47 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __PCI_IRQ_H__ +#define __PCI_IRQ_H__ + +struct pci_devinst; + +void pci_irq_assert(struct pci_devinst *pi); +void pci_irq_deassert(struct pci_devinst *pi); +void pci_irq_init(struct vmctx *ctx); +void pci_irq_reserve(int irq); +void pci_irq_use(int irq); +int pirq_alloc_pin(struct pci_devinst *pi); +int pirq_irq(int pin); +uint8_t pirq_read(int pin); +void pirq_write(struct vmctx *ctx, int pin, uint8_t val); + +#endif diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c new file mode 100644 index 0000000000..b7ddb772a1 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -0,0 +1,481 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <machine/vmm.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <vmmapi.h> + +#include "acpi.h" +#include "bootrom.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "uart_emul.h" + +#define IO_ICU1 0x20 +#define IO_ICU2 0xA0 + +SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); +SET_DECLARE(lpc_sysres_set, struct lpc_sysres); + +#define ELCR_PORT 0x4d0 +SYSRES_IO(ELCR_PORT, 2); + +#define IO_TIMER1_PORT 0x40 + +#define NMISC_PORT 0x61 +SYSRES_IO(NMISC_PORT, 1); + +static struct pci_devinst *lpc_bridge; + +static const char *romfile; + +#define LPC_UART_NUM 2 +static struct lpc_uart_softc { + struct uart_softc *uart_softc; + const char *opts; + int iobase; + int irq; + int enabled; +} lpc_uart_softc[LPC_UART_NUM]; + +static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; + +/* + * LPC device configuration is in the following form: + * <lpc_device_name>[,<options>] + * For e.g. "com1,stdio" or "bootrom,/var/romfile" + */ +int +lpc_device_parse(const char *opts) +{ + int unit, error; + char *str, *cpy, *lpcdev; + + error = -1; + str = cpy = strdup(opts); + lpcdev = strsep(&str, ","); + if (lpcdev != NULL) { + if (strcasecmp(lpcdev, "bootrom") == 0) { + romfile = str; + error = 0; + goto done; + } + for (unit = 0; unit < LPC_UART_NUM; unit++) { + if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { + lpc_uart_softc[unit].opts = str; + error = 0; + goto done; + } + } + } + +done: + if (error) + free(cpy); + + return (error); +} + +void +lpc_print_supported_devices() +{ + size_t i; + + printf("bootrom\n"); + for (i = 0; i < LPC_UART_NUM; i++) + printf("%s\n", lpc_uart_names[i]); +} + +const char * +lpc_bootrom(void) +{ + + return (romfile); +} + +static void +lpc_uart_intr_assert(void *arg) +{ + struct lpc_uart_softc *sc = arg; + + assert(sc->irq >= 0); + + vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); +} + +static void +lpc_uart_intr_deassert(void *arg) +{ + /* + * The COM devices on the LPC bus generate edge triggered interrupts, + * so nothing more to do here. + */ +} + +static int +lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int offset; + struct lpc_uart_softc *sc = arg; + + offset = port - sc->iobase; + + switch (bytes) { + case 1: + if (in) + *eax = uart_read(sc->uart_softc, offset); + else + uart_write(sc->uart_softc, offset, *eax); + break; + case 2: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + } + break; +#ifndef __FreeBSD__ + case 4: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + *eax |= uart_read(sc->uart_softc, offset + 2) << 16; + *eax |= uart_read(sc->uart_softc, offset + 3) << 24; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + uart_write(sc->uart_softc, offset + 2, *eax >> 16); + uart_write(sc->uart_softc, offset + 3, *eax >> 24); + } + break; +#endif + default: + return (-1); + } + + return (0); +} + +static int +lpc_init(struct vmctx *ctx) +{ + struct lpc_uart_softc *sc; + struct inout_port iop; + const char *name; + int unit, error; + + if (romfile != NULL) { + error = bootrom_init(ctx, romfile); + if (error) + return (error); + } + + /* COM1 and COM2 */ + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + name = lpc_uart_names[unit]; + + if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { + fprintf(stderr, "Unable to allocate resources for " + "LPC device %s\n", name); + return (-1); + } + pci_irq_reserve(sc->irq); + + sc->uart_softc = uart_init(lpc_uart_intr_assert, + lpc_uart_intr_deassert, sc); + + if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' " + "for LPC device %s\n", sc->opts, name); + return (-1); + } + + bzero(&iop, sizeof(struct inout_port)); + iop.name = name; + iop.port = sc->iobase; + iop.size = UART_IO_BAR_SIZE; + iop.flags = IOPORT_F_INOUT; + iop.handler = lpc_uart_io_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + sc->enabled = 1; + } + + return (0); +} + +static void +pci_lpc_write_dsdt(struct pci_devinst *pi) +{ + struct lpc_dsdt **ldpp, *ldp; + + dsdt_line(""); + dsdt_line("Device (ISA)"); + dsdt_line("{"); + dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); + dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); + dsdt_line(" {"); + dsdt_line(" Offset (0x60),"); + dsdt_line(" PIRA, 8,"); + dsdt_line(" PIRB, 8,"); + dsdt_line(" PIRC, 8,"); + dsdt_line(" PIRD, 8,"); + dsdt_line(" Offset (0x68),"); + dsdt_line(" PIRE, 8,"); + dsdt_line(" PIRF, 8,"); + dsdt_line(" PIRG, 8,"); + dsdt_line(" PIRH, 8"); + dsdt_line(" }"); + dsdt_line(""); + + dsdt_indent(1); + SET_FOREACH(ldpp, lpc_dsdt_set) { + ldp = *ldpp; + ldp->handler(); + } + + dsdt_line(""); + dsdt_line("Device (PIC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_ICU1, 2); + dsdt_fixed_ioport(IO_ICU2, 2); + dsdt_fixed_irq(2); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (TIMR)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_TIMER1_PORT, 4); + dsdt_fixed_irq(0); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + dsdt_unindent(1); + + dsdt_line("}"); +} + +static void +pci_lpc_sysres_dsdt(void) +{ + struct lpc_sysres **lspp, *lsp; + + dsdt_line(""); + dsdt_line("Device (SIO)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + + dsdt_indent(2); + SET_FOREACH(lspp, lpc_sysres_set) { + lsp = *lspp; + switch (lsp->type) { + case LPC_SYSRES_IO: + dsdt_fixed_ioport(lsp->base, lsp->length); + break; + case LPC_SYSRES_MEM: + dsdt_fixed_mem32(lsp->base, lsp->length); + break; + } + } + dsdt_unindent(2); + + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(pci_lpc_sysres_dsdt); + +static void +pci_lpc_uart_dsdt(void) +{ + struct lpc_uart_softc *sc; + int unit; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + if (!sc->enabled) + continue; + dsdt_line(""); + dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); + dsdt_line(" Name (_UID, %d)", unit + 1); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); + dsdt_fixed_irq(sc->irq); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + } +} +LPC_DSDT(pci_lpc_uart_dsdt); + +static int +pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int pirq_pin; + + if (bytes == 1) { + pirq_pin = 0; + if (coff >= 0x60 && coff <= 0x63) + pirq_pin = coff - 0x60 + 1; + if (coff >= 0x68 && coff <= 0x6b) + pirq_pin = coff - 0x68 + 5; + if (pirq_pin != 0) { + pirq_write(ctx, pirq_pin, val); + pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + return (0); + } + } + return (-1); +} + +static void +pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ +} + +static uint64_t +pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + return (0); +} + +#define LPC_DEV 0x7000 +#define LPC_VENDOR 0x8086 + +static int +pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* + * Do not allow more than one LPC bridge to be configured. + */ + if (lpc_bridge != NULL) { + fprintf(stderr, "Only one LPC bridge is allowed.\n"); + return (-1); + } + + /* + * Enforce that the LPC can only be configured on bus 0. This + * simplifies the ACPI DSDT because it can provide a decode for + * all legacy i/o ports behind bus 0. + */ + if (pi->pi_bus != 0) { + fprintf(stderr, "LPC bridge can be present only on bus 0.\n"); + return (-1); + } + + if (lpc_init(ctx) != 0) + return (-1); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + + lpc_bridge = pi; + + return (0); +} + +char * +lpc_pirq_name(int pin) +{ + char *name; + + if (lpc_bridge == NULL) + return (NULL); + asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); + return (name); +} + +void +lpc_pirq_routed(void) +{ + int pin; + + if (lpc_bridge == NULL) + return; + + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); +} + +struct pci_devemu pci_de_lpc = { + .pe_emu = "lpc", + .pe_init = pci_lpc_init, + .pe_write_dsdt = pci_lpc_write_dsdt, + .pe_cfgwrite = pci_lpc_cfgwrite, + .pe_barwrite = pci_lpc_write, + .pe_barread = pci_lpc_read +}; +PCI_EMUL_SET(pci_de_lpc); diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h new file mode 100644 index 0000000000..9041f79c50 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _LPC_H_ +#define _LPC_H_ + +#include <sys/linker_set.h> + +typedef void (*lpc_write_dsdt_t)(void); + +struct lpc_dsdt { + lpc_write_dsdt_t handler; +}; + +#define LPC_DSDT(handler) \ + static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \ + (handler), \ + }; \ + DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__)) + +enum lpc_sysres_type { + LPC_SYSRES_IO, + LPC_SYSRES_MEM +}; + +struct lpc_sysres { + enum lpc_sysres_type type; + uint32_t base; + uint32_t length; +}; + +#define LPC_SYSRES(type, base, length) \ + static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \ + (type), \ + (base), \ + (length) \ + }; \ + DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__)) + +#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) +#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) + +int lpc_device_parse(const char *opt); +void lpc_print_supported_devices(); +char *lpc_pirq_name(int pin); +void lpc_pirq_routed(void); +const char *lpc_bootrom(void); + +#endif diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c new file mode 100644 index 0000000000..387611c888 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -0,0 +1,1897 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 Shunsuke Mie + * Copyright (c) 2018 Leon Dang + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * bhyve PCIe-NVMe device emulation. + * + * options: + * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <machine/atomic.h> +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <dev/nvme/nvme.h> + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +/* Convert a zero-based value into a one-based value */ +#define ONE_BASED(zero) ((zero) + 1) +/* Convert a one-based value into a zero-based value */ +#define ZERO_BASED(one) ((one) - 1) + +/* Encode number of SQ's and CQ's for Set/Get Features */ +#define NVME_FEATURE_NUM_QUEUES(sc) \ + (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ + (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + /* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; /* max number of IO SQ's or CQ's */ + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* + * Memory mapped Submission and Completion queues + * Each array includes both Admin and IO queues + */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +cpywithpad(char *dst, size_t dst_size, const char *src, char pad) +{ + size_t len; + + len = strnlen(src, dst_size); + memset(dst, pad, dst_size); + memcpy(dst, src, len); +} + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); + cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); + + /* Num of submission commands that we can handle at a time (2^rab) */ + cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states support */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); + cd->nn = 1; /* number of namespaces */ + + cd->fna = 0x03; + + cd->power_state[0].mp = 10; +} + +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +{ + struct nvme_namespace_data *nd; + + nd = &sc->nsdata; + + nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; + nd->ncap = nd->nsze; + nd->nuse = nd->nsze; + + /* Get LBA and backstore information from backing store */ + nd->nlbaf = 1; + /* LBA data-sz = 2^lbads */ + nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; + + nd->flbas = 0; +} + +static void +pci_nvme_reset_locked(struct pci_nvme_softc *sc) +{ + DPRINTF(("%s\r\n", __func__)); + + sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | + (1 << NVME_CAP_LO_REG_CQR_SHIFT) | + (60 << NVME_CAP_LO_REG_TO_SHIFT); + + sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; + + sc->regs.vs = 0x00010300; /* NVMe v1.3 */ + + sc->regs.cc = 0; + sc->regs.csts = 0; + + sc->num_cqueues = sc->num_squeues = sc->max_queues; + if (sc->submit_queues != NULL) { + for (int i = 0; i < sc->num_squeues + 1; i++) { + /* + * The Admin Submission Queue is at index 0. + * It must not be changed at reset otherwise the + * emulation will be out of sync with the guest. + */ + if (i != 0) { + sc->submit_queues[i].qbase = NULL; + sc->submit_queues[i].size = 0; + sc->submit_queues[i].cqid = 0; + } + sc->submit_queues[i].tail = 0; + sc->submit_queues[i].head = 0; + sc->submit_queues[i].busy = 0; + } + } else + sc->submit_queues = calloc(sc->num_squeues + 1, + sizeof(struct nvme_submission_queue)); + + if (sc->compl_queues != NULL) { + for (int i = 0; i < sc->num_cqueues + 1; i++) { + /* See Admin Submission Queue note above */ + if (i != 0) { + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else { + sc->compl_queues = calloc(sc->num_cqueues + 1, + sizeof(struct nvme_completion_queue)); + + for (int i = 0; i < sc->num_cqueues + 1; i++) + pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); + } +} + +static void +pci_nvme_reset(struct pci_nvme_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + pci_nvme_reset_locked(sc); + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +{ + uint16_t acqs, asqs; + + DPRINTF(("%s\r\n", __func__)); + + asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; + sc->submit_queues[0].size = asqs; + sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, + sizeof(struct nvme_command) * asqs); + + DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.asq, sc->submit_queues[0].qbase)); + + acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & + NVME_AQA_REG_ACQS_MASK) + 1; + sc->compl_queues[0].size = acqs; + sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, + sizeof(struct nvme_completion) * acqs); + DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.acq, sc->compl_queues[0].qbase)); +} + +static int +nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_squeues) { + WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->submit_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_submission_queue *nsq; + + if ((qid == 0) || (qid > sc->num_squeues)) { + WPRINTF(("%s queue index %u > num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + nsq = &sc->submit_queues[qid]; + nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(struct nvme_command) * (size_t)nsq->size); + nsq->cqid = (command->cdw11 >> 16) & 0xffff; + nsq->qpriority = (command->cdw11 >> 1) & 0x03; + + DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, + qid, nsq->size, nsq->qbase, nsq->cqid)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + DPRINTF(("%s completed creating IOSQ qid %u\r\n", + __func__, qid)); + } else { + /* + * Guest sent non-cont submission queue request. + * This setting is unsupported by this emulation. + */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o submission queue\r\n", __func__)); + + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + } + return (1); +} + +static int +nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_cqueues) { + WPRINTF(("%s queue index %u / num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->compl_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_completion_queue *ncq; + + if ((qid == 0) || (qid > sc->num_cqueues)) { + WPRINTF(("%s queue index %u > num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + ncq = &sc->compl_queues[qid]; + ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; + ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; + ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + command->prp1, + sizeof(struct nvme_command) * (size_t)ncq->size); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + } else { + /* + * Non-contig completion queue unsupported. + */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o completion queue\r\n", + __func__)); + + /* 0x12 = Invalid Use of Controller Memory Buffer */ + pci_nvme_status_genc(&compl->status, 0x12); + } + + return (1); +} + +static int +nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; + uint8_t logpage = command->cdw10 & 0xFF; +#ifdef __FreeBSD__ + void *data; +#else + /* Our compiler grumbles about this, despite it being OK */ + void *data = NULL; +#endif + + DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); + + if (logpage >= 1 && logpage <= 3) + data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + PAGE_SIZE); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + switch (logpage) { + case 0x01: /* Error information */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + case 0x02: /* SMART/Health information */ + /* TODO: present some smart info */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + case 0x03: /* Firmware slot information */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->nsdata)); + memcpy(dest, &sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->ctrldata)); + memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t nqr; /* Number of Queues Requested */ + + nqr = command->cdw11 & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_squeues = ONE_BASED(nqr); + if (sc->num_squeues > sc->max_queues) { + DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, + sc->max_queues)); + sc->num_squeues = sc->max_queues; + } + + nqr = (command->cdw11 >> 16) & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_cqueues = ONE_BASED(nqr); + if (sc->num_cqueues > sc->max_queues) { + DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, + sc->max_queues)); + sc->num_cqueues = sc->max_queues; + } + + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + return (0); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + nvme_set_feature_queues(sc, command, compl); + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { + if (sc->compl_queues[i].intr_vec == iv) { + if (command->cdw11 & (1 << 16)) + sc->compl_queues[i].intr_en |= + NVME_CQ_INTCOAL; + else + sc->compl_queues[i].intr_en &= + ~NVME_CQ_INTCOAL; + } + } + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration 0x%x\r\n", + command->cdw11)); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker 0x%x\r\n", + command->cdw11)); + break; + case 0x0C: + DPRINTF((" autonomous power state transition 0x%x\r\n", + command->cdw11)); + break; + default: + WPRINTF(("%s invalid feature\r\n", __func__)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration\r\n")); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management\r\n")); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range\r\n")); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold\r\n")); + switch ((command->cdw11 >> 20) & 0x3) { + case 0: + /* Over temp threshold */ + compl->cdw0 = 0xFFFF; + break; + case 1: + /* Under temp threshold */ + compl->cdw0 = 0; + break; + default: + WPRINTF((" invalid threshold type select\r\n")); + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_FIELD); + return (1); + } + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery\r\n")); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache\r\n")); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + compl->cdw0 & 0xFFFF, + (compl->cdw0 >> 16) & 0xFFFF)); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing\r\n")); + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + DPRINTF((" interrupt vector configuration\r\n")); + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity\r\n")); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration\r\n")); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker\r\n")); + break; + case 0x0C: + DPRINTF((" autonomous power state transition\r\n")); + break; + default: + WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + + /* TODO: search for the command ID and abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +#ifdef __FreeBSD__ +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} +#else +/* This is kept behind an ifdef while it's unused to appease the compiler. */ +#endif /* __FreeBSD__ */ + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (cmd->opc) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + cmd->opc)); + } + + /* for now skip async event generation */ + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->cdw0 = compl.cdw0; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + pthread_mutex_lock(&req->mtx); + + iovidx = req->io_req.br_iovcnt; + if (iovidx == NVME_MAX_BLOCKIOVS) { + int err = 0; + + DPRINTF(("large I/O, doing partial req\r\n")); + + iovidx = 0; + req->io_req.br_iovcnt = 0; + + req->io_req.br_callback = pci_nvme_io_partial; + + if (!do_write) + err = blockif_read(sc->nvstore.ctx, + &req->io_req); + else + err = blockif_write(sc->nvstore.ctx, + &req->io_req); + + /* wait until req completes before cont */ + if (err == 0) + pthread_cond_wait(&req->cv, &req->mtx); + } + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); + + req->io_req.br_iov[iovidx].iov_len = size; + + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; + + req->io_req.br_iovcnt++; + + pthread_mutex_unlock(&req->mtx); + } + } else { + /* RAM buffer: read/write directly */ + void *p = sc->nvstore.ctx; + void *gptr; + + if ((lba + size) > sc->nvstore.size) { + WPRINTF(("%s write would overflow RAM\r\n", __func__)); + return (-1); + } + + p = (void *)((uintptr_t)p + (uintptr_t)lba); + gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); + if (do_write) + memcpy(p, gptr, size); + else + memcpy(gptr, p, size); + } + return (0); +} + +static void +pci_nvme_set_completion(struct pci_nvme_softc *sc, + struct nvme_submission_queue *sq, int sqid, uint16_t cid, + uint32_t cdw0, uint16_t status, int ignore_busy) +{ + struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; + struct nvme_completion *compl; + int do_intr = 0; + int phase; + + DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", + __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), + NVME_STATUS_GET_SC(status))); + + pthread_mutex_lock(&cq->mtx); + + assert(cq->qbase != NULL); + + compl = &cq->qbase[cq->tail]; + + compl->sqhd = atomic_load_acq_short(&sq->head); + compl->sqid = sqid; + compl->cid = cid; + + // toggle phase + phase = NVME_STATUS_GET_P(compl->status); + compl->status = status; + pci_nvme_toggle_phase(&compl->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + + if (cq->intr_en & NVME_CQ_INTEN) + do_intr = 1; + + pthread_mutex_unlock(&cq->mtx); + + if (ignore_busy || !atomic_load_acq_int(&sq->busy)) + if (do_intr) + pci_generate_msix(sc->nsc_pi, cq->intr_vec); +} + +static void +pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) +{ + req->sc = NULL; + req->nvme_sq = NULL; + req->sqid = 0; + + pthread_mutex_lock(&sc->mtx); + + req->next = sc->ioreqs_free; + sc->ioreqs_free = req; + sc->pending_ios--; + + /* when no more IO pending, can set to ready if device reset/enabled */ + if (sc->pending_ios == 0 && + NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) + sc->regs.csts |= NVME_CSTS_RDY; + + pthread_mutex_unlock(&sc->mtx); + + sem_post(&sc->iosemlock); +} + +static struct pci_nvme_ioreq * +pci_nvme_get_ioreq(struct pci_nvme_softc *sc) +{ + struct pci_nvme_ioreq *req = NULL;; + + sem_wait(&sc->iosemlock); + pthread_mutex_lock(&sc->mtx); + + req = sc->ioreqs_free; + assert(req != NULL); + + sc->ioreqs_free = req->next; + + req->next = NULL; + req->sc = sc; + + sc->pending_ios++; + + pthread_mutex_unlock(&sc->mtx); + + req->io_req.br_iovcnt = 0; + req->io_req.br_offset = 0; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + req->prev_gpaddr = 0; + req->prev_size = 0; + + return req; +} + +static void +pci_nvme_io_done(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + struct nvme_submission_queue *sq = req->nvme_sq; + uint16_t code, status = 0; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + /* TODO return correct error */ + code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); + pci_nvme_release_ioreq(req->sc, req); +} + +static void +pci_nvme_io_partial(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + pthread_cond_signal(&req->cv); +} + + +static void +pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) +{ + struct nvme_submission_queue *sq; + uint16_t status = 0; + uint16_t sqhead; + int err; + + /* handle all submissions up to sq->tail index */ + sq = &sc->submit_queues[idx]; + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); + return; + } + + sqhead = atomic_load_acq_short(&sq->head); + + DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", + idx, sqhead, sq->tail, sq->qbase)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + struct nvme_command *cmd; + struct pci_nvme_ioreq *req = NULL; + uint64_t lba; + uint64_t nblocks, bytes, size, cpsz; + + /* TODO: support scatter gather list handling */ + + cmd = &sq->qbase[sqhead]; + sqhead = (sqhead + 1) % sq->size; + + lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; + + if (cmd->opc == NVME_OPC_FLUSH) { + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } else if (cmd->opc == 0x08) { + /* TODO: write zeroes */ + WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", + __func__, lba, cmd->cdw12 & 0xFFFF)); + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + nblocks = (cmd->cdw12 & 0xFFFF) + 1; + + bytes = nblocks * sc->nvstore.sectsz; + + if (sc->nvstore.type == NVME_STOR_BLOCKIF) { + req = pci_nvme_get_ioreq(sc); + req->nvme_sq = sq; + req->sqid = idx; + } + + /* + * If data starts mid-page and flows into the next page, then + * increase page count + */ + + DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " + "(%lu-bytes)\r\n", + sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, + cmd->opc == NVME_OPC_WRITE ? + "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + cmd->opc == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status = 0; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (cmd->opc) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: + err = blockif_write(sc->nvstore.ctx, &req->io_req); + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, cmd->opc)); + err = 1; + } + +do_error: + if (err) { + uint16_t status = 0; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? "WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + NVME_CSTS_REG_SHST_SHIFT); + sc->regs.csts |= NVME_SHST_COMPLETE << + NVME_CSTS_REG_SHST_SHIFT; + } + if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { + if (NVME_CC_GET_EN(ccreg) == 0) + /* transition 1-> causes controller reset */ + pci_nvme_reset_locked(sc); + else + pci_nvme_init_controller(ctx, sc); + } + + /* Insert the iocqes, iosqes and en bits from the write */ + sc->regs.cc &= ~NVME_CC_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; + if (NVME_CC_GET_EN(ccreg) == 0) { + /* Insert the ams, mps and css bit fields */ + sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; + sc->regs.csts &= ~NVME_CSTS_RDY; + } else if (sc->pending_ios == 0) { + sc->regs.csts |= NVME_CSTS_RDY; + } + break; + case NVME_CR_CSTS: + break; + case NVME_CR_NSSR: + /* ignore writes; don't support subsystem reset */ + break; + case NVME_CR_AQA: + sc->regs.aqa = (uint32_t)value; + break; + case NVME_CR_ASQ_LOW: + sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + case NVME_CR_ACQ_LOW: + sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + default: + DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", + __func__, offset, value, size)); + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx\r\n", baridx, offset, size, value)); + + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, offset, size, value); + break; + + default: + DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", + __func__, baridx, value)); + } +} + +static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, + uint64_t offset, int size) +{ + uint64_t value; + + pci_nvme_bar0_reg_dumps(__func__, offset, 0); + + if (offset < NVME_DOORBELL_OFFSET) { + void *p = &(sc->regs); + pthread_mutex_lock(&sc->mtx); + memcpy(&value, (void *)((uintptr_t)p + offset), size); + pthread_mutex_unlock(&sc->mtx); + } else { + value = 0; + WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); + } + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", + offset, size, (uint32_t)value)); + + return (value); +} + + + +static uint64_t +pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", + baridx, offset, size)); + + return pci_emul_msix_tread(pi, offset, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, offset, size); + + default: + DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); + } + + return (0); +} + + +static int +pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +{ + char bident[sizeof("XX:X:X")]; + char *uopt, *xopts, *config; + uint32_t sectsz; + int optidx; + + sc->max_queues = NVME_QUEUES; + sc->max_qentries = NVME_MAX_QENTRIES; + sc->ioslots = NVME_IOSLOTS; + sc->num_squeues = sc->max_queues; + sc->num_cqueues = sc->max_queues; + sectsz = 0; + + uopt = strdup(opts); + optidx = 0; + snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), + "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + + if ((config = strchr(xopts, '=')) != NULL) + *config++ = '\0'; + + if (!strcmp("maxq", xopts)) { + sc->max_queues = atoi(config); + } else if (!strcmp("qsz", xopts)) { + sc->max_qentries = atoi(config); + } else if (!strcmp("ioslots", xopts)) { + sc->ioslots = atoi(config); + } else if (!strcmp("sectsz", xopts)) { + sectsz = atoi(config); + } else if (!strcmp("ser", xopts)) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. + */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), config, ' '); + } else if (!strcmp("ram", xopts)) { + uint64_t sz = strtoull(&xopts[4], NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + perror("Unable to allocate RAM"); + free(uopt); + return (-1); + } + } else if (optidx == 0) { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(xopts, bident); + if (sc->nvstore.ctx == NULL) { + perror("Could not open backing file"); + free(uopt); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); + } else { + fprintf(stderr, "Invalid option %s\n", xopts); + free(uopt); + return (-1); + } + + optidx++; + } + free(uopt); + + if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { + fprintf(stderr, "backing store not specified\n"); + return (-1); + } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) + sc->nvstore.sectsz = sectsz; + else if (sc->nvstore.type != NVME_STOR_RAM) + sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); + for (sc->nvstore.sectsz_bits = 9; + (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; + sc->nvstore.sectsz_bits++); + + if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) + sc->max_queues = NVME_QUEUES; + + if (sc->max_qentries <= 0) { + fprintf(stderr, "Invalid qsz option\n"); + return (-1); + } + if (sc->ioslots <= 0) { + fprintf(stderr, "Invalid ioslots option\n"); + return (-1); + } + + return (0); +} + +static int +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_nvme_softc *sc; + uint32_t pci_membar_sz; + int error; + + error = 0; + + sc = calloc(1, sizeof(struct pci_nvme_softc)); + pi->pi_arg = sc; + sc->nsc_pi = pi; + + error = pci_nvme_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); + for (int i = 0; i < sc->ioslots; i++) { + if (i < (sc->ioslots-1)) + sc->ioreqs[i].next = &sc->ioreqs[i+1]; + pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); + pthread_cond_init(&sc->ioreqs[i].cv, NULL); + } + sc->ioreqs_free = sc->ioreqs; + sc->intr_coales_aggr_thresh = 1; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); + pci_set_cfgdata8(pi, PCIR_PROGIF, + PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); + + /* allocate size of nvme registers + doorbell space for all queues */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2*sizeof(uint32_t)*(sc->max_queues + 1); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c new file mode 100644 index 0000000000..3782914cd5 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -0,0 +1,910 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/pciio.h> +#include <sys/ioctl.h> + +#include <sys/pci.h> + +#include <dev/io/iodev.h> +#include <dev/pci/pcireg.h> + +#include <machine/iodev.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <sysexits.h> +#include <unistd.h> + +#include <machine/vmm.h> +#include <vmmapi.h> +#include <sys/ppt_dev.h> +#include "pci_emul.h" +#include "mem.h" + +#define LEGACY_SUPPORT 1 + +#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) +#define MSIX_CAPLEN 12 + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + int pptfd; + int msi_limit; + int msix_limit; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct passthru_softc *sc, long reg, int width) +{ + struct ppt_cfg_io pi; + + pi.pci_off = reg; + pi.pci_width = width; + + if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) { + return (0); + } + return (pi.pci_data); +} + +static void +write_config(const struct passthru_softc *sc, long reg, int width, + uint32_t data) +{ + struct ppt_cfg_io pi; + + pi.pci_off = reg; + pi.pci_width = width; + pi.pci_data = data; + + (void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi); +} + +static int +passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type, + uint64_t *base, uint64_t *size) +{ + struct ppt_bar_query pb; + + pb.pbq_baridx = bar; + + if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) { + return (-1); + } + + switch (pb.pbq_type) { + case PCI_ADDR_IO: + *type = PCIBAR_IO; + break; + case PCI_ADDR_MEM32: + *type = PCIBAR_MEM32; + break; + case PCI_ADDR_MEM64: + *type = PCIBAR_MEM64; + break; + default: + err(1, "unrecognized BAR type: %u\n", pb.pbq_type); + break; + } + + *base = pb.pbq_base; + *size = pb.pbq_size; + return (0); +} + +static int +passthru_dev_open(const char *path, int *pptfdp) +{ + int pptfd; + + if ((pptfd = open(path, O_RDWR)) < 0) { + return (errno); + } + + /* XXX: verify fd with ioctl? */ + *pptfdp = pptfd; + return (0); +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static void +passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap) +{ + struct pci_devinst *pi = sc->psc_pi; + int off; + + /* Reduce the number of MSI vectors if higher than OS limit */ + if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) { + int msi_limit, mmc; + + msi_limit = + sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 : + sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 : + sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 : + sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 : + sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 : + PCIM_MSICTRL_MMC_1; + mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK; + + if (mmc > msi_limit) { + sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK; + sc->psc_msi.msgctrl |= msi_limit; + pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl); + } + } + + /* Reduce the number of MSI-X vectors if higher than OS limit */ + if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) { + if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) { + msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE; + msixcap->msgctrl |= sc->msix_limit - 1; + pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl); + } + } +} + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int i, ptr, capptr, cap, sts, caplen, table_size; + uint32_t u32; + struct pci_devinst *pi = sc->psc_pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; + + /* + * Parse the capabilities and cache the location of the MSI + * and MSI-X capabilities. + */ + sts = read_config(sc, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(sc, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(sc, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(sc, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; + while (caplen > 0) { + u32 = read_config(sc, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(sc, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; + } + } + ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1); + } + } + + passthru_intr_limit(sc, &msixcap); + + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_bar = + msixcap.table_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); + + /* Allocate the emulated MSI-X table array */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* Mask all table entries */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + pi->pi_msix.table[i].vector_control |= + PCIM_MSIX_VCTRL_MASK; + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. + */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(sc, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) { + return (-1); + } else { + return (0); + } +} + +static uint64_t +passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + uint64_t data; + size_t entry_offset; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src64; + break; + default: + return (-1); + } + return (data); + } + + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return (-1); + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + switch(size) { + case 1: + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; + break; + default: + return (-1); + } + + return (data); +} + +static void +passthru_msix_table_write(struct vmctx *ctx, int vcpu, + struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *dest8; + uint16_t *dest16; + uint32_t *dest32; + uint64_t *dest64; + size_t entry_offset; + uint32_t vector_control; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest8 = data; + break; + case 2: + dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest16 = data; + break; + case 4: + dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest32 = data; + break; + case 8: + dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest64 = data; + break; + default: + break; + } + return; + } + + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return; + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* Only 4 byte naturally-aligned writes are supported */ + assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest32 = (uint32_t *)((void *)entry + entry_offset); + *dest32 = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + (void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd, + index, entry->addr, entry->msg_data, + entry->vector_control); + } + } +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int error, idx; + size_t len, remaining; + uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; + + assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. + */ + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); + + table_size = pi->pi_msix.table_offset - table_offset; + table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + table_size = roundup2(table_size, 4096); + + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * If the PBA does not share a page with the MSI-x + * tables, no PBA emulation is required. + */ + pi->pi_msix.pba_page = NULL; + pi->pi_msix.pba_page_offset = 0; + } else { + /* + * The PBA overlaps with either the first or last + * page of the MSI-X table region. Map the + * appropriate page. + */ + if (pba_offset <= table_offset) + pi->pi_msix.pba_page_offset = table_offset; + else + pi->pi_msix.pba_page_offset = table_offset + + table_size - 4096; + pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | + PROT_WRITE, MAP_SHARED, sc->pptfd, + pi->pi_msix.pba_page_offset); + if (pi->pi_msix.pba_page == MAP_FAILED) { + warn("Failed to map PBA page for MSI-X on %d", + sc->pptfd); + return (-1); + } + } + } + + /* Map everything before the MSI-X table */ + if (table_offset > 0) { + len = table_offset; + error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base); + if (error) + return (error); + + base += len; + start += len; + remaining -= len; + } + + /* Skip the MSI-X table */ + base += table_size; + start += table_size; + remaining -= table_size; + + /* Map everything beyond the end of the MSI-X table */ + if (remaining > 0) { + len = remaining; + error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base); + if (error) + return (error); + } + + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + struct pci_devinst *pi = sc->psc_pi; + uint_t i; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + enum pcibar_type bartype; + uint64_t base, size; + int error; + + if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) { + continue; + } + + if (bartype != PCIBAR_IO) { + if (((base | size) & PAGE_MASK) != 0) { + warnx("passthru device %d BAR %d: " + "base %#lx or size %#lx not page aligned\n", + sc->pptfd, i, base, size); + return (-1); + } + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = size; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + if (error) + return (-1); + + /* The MSI-X table needs special handling */ + if (i == pci_msix_table_bar(pi)) { + error = init_msix_table(ctx, sc, base); + if (error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical BAR in the guest MMIO space */ + error = vm_map_pptdev_mmio(ctx, sc->pptfd, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. + */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct passthru_softc *sc) +{ + if (cfginitmsi(sc) != 0) { + warnx("failed to initialize MSI for PCI %d", sc->pptfd); + return (-1); + } + + if (cfginitbar(ctx, sc) != 0) { + warnx("failed to initialize BARs for PCI %d", sc->pptfd); + return (-1); + } + + return (0); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error, memflags, pptfd; + struct passthru_softc *sc; + + sc = NULL; + error = 1; + + memflags = vm_get_memflags(ctx); + if (!(memflags & VM_MEM_F_WIRED)) { + warnx("passthru requires guest memory to be wired"); + goto done; + } + + if (opts == NULL || passthru_dev_open(opts, &pptfd) != 0) { + warnx("invalid passthru options"); + goto done; + } + + if (vm_assign_pptdev(ctx, pptfd) != 0) { + warnx("PCI device at %d is not using the ppt driver", pptfd); + goto done; + } + + sc = calloc(1, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + sc->pptfd = pptfd; + + if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit, + &sc->msix_limit)) != 0) + goto done; + + /* initialize config space */ + if ((error = cfginit(ctx, sc)) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, pptfd); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +msixcap_access(struct passthru_softc *sc, int coff) +{ + if (sc->psc_msix.capoff == 0) + return (0); + + return (coff >= sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + + /* + * MSI-X is also emulated since a limit on interrupts may be imposed by + * the OS, altering the perceived register state. + */ + if (msixcap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(sc, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int error, msix_table_entries, i; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd, + pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); + if (error != 0) + err(1, "vm_setup_pptdev_msi"); + return (0); + } + + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->pptfd, i, + pi->pi_msix.table[i].addr, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control); + + if (error) + err(1, "vm_setup_pptdev_msix"); + } + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. + */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(sc, coff, bytes, val); + + return (0); +} + +static void +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct passthru_softc *sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + passthru_msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + struct ppt_bar_io pbi; + + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + + pbi.pbi_bar = baridx; + pbi.pbi_width = size; + pbi.pbi_off = offset; + pbi.pbi_data = value; + (void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi); + } +} + +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct passthru_softc *sc = pi->pi_arg; + uint64_t val; + + if (baridx == pci_msix_table_bar(pi)) { + val = passthru_msix_table_read(sc, offset, size); + } else { + struct ppt_bar_io pbi; + + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + + pbi.pbi_bar = baridx; + pbi.pbi_width = size; + pbi.pbi_off = offset; + if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) { + val = pbi.pbi_data; + } else { + val = 0; + } + } + + return (val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, +}; +PCI_EMUL_SET(passthru); diff --git a/usr/src/cmd/bhyve/pci_uart.c b/usr/src/cmd/bhyve/pci_uart.c new file mode 100644 index 0000000000..093d0cb361 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_uart.c @@ -0,0 +1,121 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <stdio.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "uart_emul.h" + +/* + * Pick a PCI vid/did of a chip with a single uart at + * BAR0, that most versions of FreeBSD can understand: + * Siig CyberSerial 1-port. + */ +#define COM_VENDOR 0x131f +#define COM_DEV 0x2000 + +static void +pci_uart_intr_assert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_assert(pi); +} + +static void +pci_uart_intr_deassert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_deassert(pi); +} + +static void +pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + + assert(baridx == 0); + assert(size == 1); + + uart_write(pi->pi_arg, offset, value); +} + +uint64_t +pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + uint8_t val; + + assert(baridx == 0); + assert(size == 1); + + val = uart_read(pi->pi_arg, offset); + return (val); +} + +static int +pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct uart_softc *sc; + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); + pci_lintr_request(pi); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + + sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); + pi->pi_arg = sc; + + if (uart_set_backend(sc, opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' for " + "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func); + return (-1); + } + + return (0); +} + +struct pci_devemu pci_de_com = { + .pe_emu = "uart", + .pe_init = pci_uart_init, + .pe_barwrite = pci_uart_write, + .pe_barread = pci_uart_read +}; +PCI_EMUL_SET(pci_de_com); diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c new file mode 100644 index 0000000000..b0c3b06187 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -0,0 +1,485 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <sys/disk.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <md5.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "block_if.h" + +#ifdef __FreeBSD__ +#define VTBLK_RINGSZ 64 +#else +/* Enlarge to match bigger BLOCKIF_IOV_MAX */ +#define VTBLK_RINGSZ 128 +#endif + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + 1 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; + uint32_t vbc_blk_size; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; +#ifndef __FreeBSD__ + int vbsc_wce; +#endif + char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; +}; + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifndef __FreeBSD__ +static void pci_vtblk_apply_feats(void *, uint64_t); +#endif + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ +#ifndef __FreeBSD__ + pci_vtblk_apply_feats, /* apply negotiated features */ +#else + NULL, /* apply negotiated features */ +#endif + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtblk_reset(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !\n")); + vi_reset_dev(&sc->vbsc_vs); +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif +} + +static void +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) +{ + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + struct pci_vtblk_ioreq *io; + int i, n; + int err; + ssize_t iolen; + int writeop, type; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + + io = &sc->vbsc_ios[idx]; + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE); + + iolen = 0; + for (i = 1; i < n; i++) { + /* + * - write op implies read-only descriptor, + * - read/ident op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; + } + io->io_req.br_resid = iolen; + + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", + writeop ? "write" : "read/ident", iolen, i - 1, + io->io_req.br_offset)); + + switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; + case VBH_OP_WRITE: + err = blockif_write(sc->bc, &io->io_req); + break; + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + pci_vtblk_done_locked(io, 0); + return; + default: + pci_vtblk_done_locked(io, EOPNOTSUPP); + return; + } + assert(err == 0); +} + +static void +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtblk_softc *sc = vsc; + + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int i, sectsz, sts, sto; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * The supplied backing file has to exist + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + perror("Could not open backing file"); + return (1); + } + + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } + +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ +#ifdef __FreeBSD__ + sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX; +#else + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); +#endif + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. + */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); + return (1); + } + vi_set_io_bar(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +#ifndef __FreeBSD__ +void +pci_vtblk_apply_feats(void *vsc, uint64_t caps) +{ + struct pci_vtblk_softc *sc = vsc; + const int wce_next = ((caps & VTBLK_F_FLUSH) != 0) ? 1 : 0; + + if (sc->vbsc_wce != wce_next) { + (void) blockif_set_wce(sc->bc, wce_next); + sc->vbsc_wce = wce_next; + } +} +#endif /* __FreeBSD__ */ + +struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c new file mode 100644 index 0000000000..90437662df --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -0,0 +1,701 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama <jceel@FreeBSD.org> + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/linker_set.h> +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <libgen.h> +#include <sysexits.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); +static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!\n")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + struct sockaddr_un sun; + char *pathcopy; +#else + /* Our compiler #defines 'sun' as '1'. Awesome. */ + struct sockaddr_un addr; +#endif + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + +#ifdef __FreeBSD__ + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } +#else /* __FreeBSD__ */ + /* Do a simple bind rather than the FreeBSD bindat() */ + addr.sun_family = AF_UNIX; + (void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path)); + if (bind(fd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { + error = -1; + goto out; + } +#endif /* __FreeBSD__ */ + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0 && s != -1) + close(s); + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchain(vq); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + int i, ret; +#else + int i, ret = 0; +#endif + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d\n", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void +pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + fprintf(stderr, "cannot create port %s: %s\n", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct pci_devemu pci_de_vcon = { + .pe_emu = "virtio-console", + .pe_init = pci_vtcon_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vcon); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c new file mode 100644 index 0000000000..74efbcaee1 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -0,0 +1,1169 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/linker_set.h> +#include <sys/select.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <machine/atomic.h> +#include <net/ethernet.h> +#ifdef __FreeBSD__ +#ifndef NETMAP_WITH_LIBS +#define NETMAP_WITH_LIBS +#endif +#include <net/netmap_user.h> +#endif + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <md5.h> +#include <pthread.h> +#include <pthread_np.h> +#include <sysexits.h> +#ifndef __FreeBSD__ +#include <poll.h> +#include <libdlpi.h> +#endif + +#include "bhyverun.h" +#include "pci_emul.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "virtio.h" + +#define VTNET_RINGSZ 1024 + +#define VTNET_MAXSEGS 256 + +/* + * Host capabilities. Note that we only offer a few of these. + */ +#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE \ + (1 << 21) /* guest can send gratuitous pkts */ + +#define VTNET_S_HOSTCAPS \ + ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 /* NB: not yet supported */ + +#define VTNET_MAXQ 3 + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + +#ifdef __FreeBSD + int vsc_tapfd; +#else + dlpi_handle_t vsc_dhp; + int vsc_dlpifd; +#endif + struct nm_desc *vsc_nmd; + + int vsc_rx_ready; + volatile int resetting; /* set and checked outside lock */ + + uint64_t vsc_features; /* negotiated features */ + + struct virtio_net_config vsc_config; + + pthread_mutex_t rx_mtx; + int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ + + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; + + void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); + void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, + int iovcnt, int len); +}; + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; + +/* + * If the transmit thread is active then stall until it is done. + */ +static void +pci_vtnet_txwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * If the receive thread is active then stall until it is done. + */ +static void +pci_vtnet_rxwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->rx_mtx); + while (sc->rx_in_progress) { + pthread_mutex_unlock(&sc->rx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->rx_mtx); + } + pthread_mutex_unlock(&sc->rx_mtx); +} + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + sc->resetting = 1; + + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); + + sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; +} + +/* + * Called to send a buffer chain out to the tap device + */ +#ifdef __FreeBSD__ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} +#else +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + (void) dlpi_send(sc->vsc_dhp, NULL, NULL, + iov[i].iov_base, iov[i].iov_len, NULL); + } +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; +#endif /* __FreeBSD__ */ + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= tlen); + + iov[0].iov_len -= tlen; + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); + riov = &iov[0]; + } + + return (riov); +} + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int n; +#ifdef __FreeBSD__ + int len; +#else + size_t len; + int ret; +#endif + uint16_t idx; + + /* + * Should never be called without a valid tap fd + */ +#ifdef __FreeBSD__ + assert(sc->vsc_tapfd != -1); +#else + assert(sc->vsc_dlpifd != -1); +#endif + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { +#ifdef __FreeBSD__ + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); +#endif + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ +#ifdef __FreeBSD__ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); +#endif + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); +#ifdef __FreeBSD__ + len = readv(sc->vsc_tapfd, riov, n); +#else + len = riov[0].iov_len; + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, + (uint8_t *)riov[0].iov_base, &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + errno = EWOULDBLOCK; + len = 0; + } +#endif + if (len <= 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, len + sc->rx_vhdrlen); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + +#ifdef __FreeBSD__ +static __inline int +pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int r, i; + int len = 0; + + for (r = nmd->cur_tx_ring; ; ) { + struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_tx_ring) + r = nmd->first_tx_ring; + if (r == nmd->cur_tx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + + for (i = 0; i < iovcnt; i++) { + if (len + iov[i].iov_len > 2048) + break; + memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); + len += iov[i].iov_len; + } + ring->slot[cur].len = len; + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_tx_ring = r; + ioctl(nmd->fd, NIOCTXSYNC, NULL); + break; + } + + return (len); +} + +static __inline int +pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int len = 0; + int i = 0; + int r; + + for (r = nmd->cur_rx_ring; ; ) { + struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + size_t left; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_rx_ring) + r = nmd->first_rx_ring; + if (r == nmd->cur_rx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + left = ring->slot[cur].len; + + for (i = 0; i < iovcnt && left > 0; i++) { + if (iov[i].iov_len > left) + iov[i].iov_len = left; + memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); + len += iov[i].iov_len; + left -= iov[i].iov_len; + } + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_rx_ring = r; + ioctl(nmd->fd, NIOCRXSYNC, NULL); + break; + } + for (; i < iovcnt; i++) + iov[i].iov_len = 0; + + return (len); +} + +/* + * Called to send a buffer chain out to the vale port + */ +static void +pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_nmd == NULL) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); +} + +static void +pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid netmap descriptor + */ + assert(sc->vsc_nmd != NULL); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); + + if (len == 0) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, len + sc->rx_vhdrlen); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +static void +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->rx_mtx); + sc->rx_in_progress = 1; + sc->pci_vtnet_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->rx_mtx); + +} +#else +static void * +pci_vtnet_poll_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + pollfd_t pollset; + + pollset.fd = sc->vsc_dlpifd; + pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno == EINTR) + continue; + fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno); + continue; + } + pthread_mutex_lock(&sc->vsc_mtx); + sc->rx_in_progress = 1; + pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->vsc_mtx); + } + + return (NULL); +} +#endif /* __FreeBSD__ */ + +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + int i, n; + int plen, tlen; + uint16_t idx; + + /* + * Obtain chain of descriptors. The first one is + * really the header descriptor, so we need to sum + * up two lengths: packet length and transfer length. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + plen = 0; + tlen = iov[0].iov_len; + for (i = 1; i < n; i++) { + plen += iov[i].iov_len; + tlen += iov[i].iov_len; + } + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); + sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); + + /* chain is processed, release it and set tlen */ + vq_relchain(vq, idx, tlen); +} + +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? + */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. + */ + vq_endchains(vq, 1); + + pthread_mutex_lock(&sc->tx_mtx); + } + return (NULL); +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +static int +pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} +#endif /* __FreeBSD__ */ + +static void +pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif +#ifndef __FreeBSD__ + uchar_t physaddr[DLPI_PHYSADDR_MAX]; + size_t physaddrlen = DLPI_PHYSADDR_MAX; + int error; +#endif + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + sc->pci_vtnet_rx = pci_vtnet_tap_rx; + sc->pci_vtnet_tx = pci_vtnet_tap_tx; +#ifdef __FreeBSD__ + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + return; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } +#else + if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { + WPRINTF(("open of vnic device %s failed\n", devname)); + } + + if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, + &physaddrlen) != DLPI_SUCCESS) { + WPRINTF(("read MAC address of vnic device %s failed\n", + devname)); + } + if (physaddrlen != ETHERADDRL) { + WPRINTF(("bad MAC address len %d on vnic device %s\n", + physaddrlen, devname)); + } + memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); + + if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { + WPRINTF(("bind of vnic device %s failed\n", devname)); + } + + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(physical) of vnic device %s " + "failed\n", devname)); + } + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(SAP) of vnic device %s " + "failed\n", devname)); + } + + sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); + + if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { + WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", + devname)); + dlpi_close(sc->vsc_dhp); + sc->vsc_dlpifd = -1; + } + + error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); + assert(error == 0); +#endif +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) +{ + sc->pci_vtnet_rx = pci_vtnet_netmap_rx; + sc->pci_vtnet_tx = pci_vtnet_netmap_tx; + + sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); + if (sc->vsc_nmd == NULL) { + WPRINTF(("open of netmap device %s failed\n", ifname)); + return; + } + + sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + nm_close(sc->vsc_nmd); + sc->vsc_nmd = NULL; + } +} +#endif /* __FreeBSD__ */ + +static int +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ +#ifdef __FreeBSD__ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; +#endif + char tname[MAXCOMLEN + 1]; + struct pci_vtnet_softc *sc; + const char *env_msi; + char *devname; + char *vtopts; +#ifdef __FreeBSD__ + int mac_provided; +#endif + int use_msix; + + sc = calloc(1, sizeof(struct pci_vtnet_softc)); + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef __FreeBSD__ + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Use MSI if set by user + */ + use_msix = 1; + if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) { + if (strcasecmp(env_msi, "yes") == 0) + use_msix = 0; + } + + /* + * Attempt to open the tap device and read the MAC address + * if specified + */ +#ifdef __FreeBSD__ + mac_provided = 0; + sc->vsc_tapfd = -1; +#endif + sc->vsc_nmd = NULL; + if (opts != NULL) { +#ifdef __FreeBSD__ + int err; +#endif + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + +#ifdef __FreBSD__ + if (vtopts != NULL) { + err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } +#endif + +#ifdef __FreeBSD__ + if (strncmp(devname, "vale", 4) == 0) + pci_vtnet_netmap_setup(sc, devname); +#endif + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + pci_vtnet_tap_setup(sc, devname); + + free(devname); + } + +#ifdef __FreeBSD__ + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->vsc_config.mac[0] = 0x00; + sc->vsc_config.mac[1] = 0xa0; + sc->vsc_config.mac[2] = 0x98; + sc->vsc_config.mac[3] = digest[0]; + sc->vsc_config.mac[4] = digest[1]; + sc->vsc_config.mac[5] = digest[2]; + } +#endif + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + /* Link is up if we managed to open tap device or vale port. */ +#ifdef __FreeBSD__ + sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || +#else + sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 || +#endif + sc->vsc_nmd != NULL); + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) + return (1); + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_bar(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); + sc->rx_in_progress = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. + */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->tx_tid, tname); + + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < 6) { + assert(offset + size <= 6); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + /* silently ignore other writes */ + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + } + + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + +struct pci_devemu pci_de_vnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vnet); diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c new file mode 100644 index 0000000000..5f470c03a6 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c @@ -0,0 +1,209 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. + * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/linker_set.h> +#include <sys/uio.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <sysexits.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. + */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c new file mode 100644 index 0000000000..238f07398b --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -0,0 +1,737 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/time.h> +#include <sys/queue.h> +#include <sys/sbuf.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> + +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_message.h> +#include <cam/ctl/ctl.h> +#include <cam/ctl/ctl_io.h> +#include <cam/ctl/ctl_backend.h> +#include <cam/ctl/ctl_ioctl.h> +#include <cam/ctl/ctl_util.h> +#include <cam/ctl/ctl_scsi_all.h> +#include <camlib.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "iov.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request <idx=%d> completed\n", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return (NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested\n")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + .seg_max = VTSCSI_MAXSEG, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + +static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int niov_in, struct iovec *iov_out, int niov_out) +{ + struct pci_vtscsi_softc *sc = q->vsq_sc; + struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; + struct pci_vtscsi_req_cmd_wr *cmd_wr; + struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; + union ctl_io *io; + int data_niov_in, data_niov_out; + void *ext_data_ptr = NULL; + uint32_t ext_data_len = 0, ext_sg_entries = 0; + int err; + + seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, + VTSCSI_IN_HEADER_LEN(sc)); + seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, + VTSCSI_OUT_HEADER_LEN(sc)); + + truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); + truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); + iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); + + cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); + + io->io_hdr.io_type = CTL_IO_SCSI; + + if (data_niov_in > 0) { + ext_data_ptr = (void *)data_iov_in; + ext_sg_entries = data_niov_in; + ext_data_len = count_iov(data_iov_in, data_niov_in); + io->io_hdr.flags |= CTL_FLAG_DATA_OUT; + } else if (data_niov_out > 0) { + ext_data_ptr = (void *)data_iov_out; + ext_sg_entries = data_niov_out; + ext_data_len = count_iov(data_iov_out, data_niov_out); + io->io_hdr.flags |= CTL_FLAG_DATA_IN; + } + + io->scsiio.sense_len = sc->vss_config.sense_size; + io->scsiio.tag_num = (uint32_t)cmd_rd->id; + switch (cmd_rd->task_attr) { + case VIRTIO_SCSI_S_ORDERED: + io->scsiio.tag_type = CTL_TAG_ORDERED; + break; + case VIRTIO_SCSI_S_HEAD: + io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; + break; + case VIRTIO_SCSI_S_ACA: + io->scsiio.tag_type = CTL_TAG_ACA; + break; + case VIRTIO_SCSI_S_SIMPLE: + default: + io->scsiio.tag_type = CTL_TAG_SIMPLE; + break; + } + io->scsiio.ext_sg_entries = ext_sg_entries; + io->scsiio.ext_data_ptr = ext_data_ptr; + io->scsiio.ext_data_len = ext_data_len; + io->scsiio.ext_data_filled = 0; + io->scsiio.cdb_len = sc->vss_config.cdb_size; + memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) { + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + cmd_wr->response = VIRTIO_SCSI_S_FAILURE; + } else { + cmd_wr->sense_len = MIN(io->scsiio.sense_len, + sc->vss_config.sense_size); + cmd_wr->residual = io->scsiio.residual; + cmd_wr->status = io->scsiio.scsi_status; + cmd_wr->response = VIRTIO_SCSI_S_OK; + memcpy(&cmd_wr->sense, &io->scsiio.sense_data, + cmd_wr->sense_len); + } + + buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + free(cmd_rd); + free(cmd_wr); + ctl_scsi_free_io(io); + return (VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled); +} + +static void +pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t idx, n; + void *buf = NULL; + size_t bufsize; + int iolen; + + sc = vsc; + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + bufsize = iov_to_buf(iov, n, &buf); + iolen = pci_vtscsi_control_handle(sc, buf, bufsize); + buf_to_iov(buf + bufsize - iolen, iolen, iov, n, + bufsize - iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request <idx=%d> enqueued\n", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char threadname[16]; + int i; + + queue->vsq_sc = sc; + queue->vsq_vq = &sc->vss_vq[num + 2]; + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + sprintf(threadname, "virtio-scsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, threadname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + fprintf(stderr, "Invalid option %s\n", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s\n", devname, strerror(errno))); + free(sc); + return (1); + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vss_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vscsi = { + .pe_emu = "virtio-scsi", + .pe_init = pci_vtscsi_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c new file mode 100644 index 0000000000..26c8cdeeba --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -0,0 +1,837 @@ +/* + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/ioctl.h> +#include <sys/viona_io.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <signal.h> +#include <poll.h> +#include <libdladm.h> +#include <libdllink.h> +#include <libdlvnic.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VIONA_RINGSZ 1024 + +/* + * PCI config-space register offsets + */ +#define VIONA_R_CFG0 24 +#define VIONA_R_CFG1 25 +#define VIONA_R_CFG2 26 +#define VIONA_R_CFG3 27 +#define VIONA_R_CFG4 28 +#define VIONA_R_CFG5 29 +#define VIONA_R_CFG6 30 +#define VIONA_R_CFG7 31 +#define VIONA_R_MAX 31 + +#define VIONA_REGSZ VIONA_R_MAX+1 + +/* + * Queue definitions. + */ +#define VIONA_RXQ 0 +#define VIONA_TXQ 1 +#define VIONA_CTLQ 2 + +#define VIONA_MAXQ 3 + +/* + * Debug printf + */ +static volatile int pci_viona_debug; +#define DPRINTF(params) if (pci_viona_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_viona_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + + int vsc_curq; + int vsc_status; + int vsc_isr; + + datalink_id_t vsc_linkid; + int vsc_vnafd; + + /* Configurable parameters */ + char vsc_linkname[MAXLINKNAMELEN]; + uint32_t vsc_feature_mask; + uint16_t vsc_vq_size; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VIONA_MAXQ]; + uint16_t vsc_msix_table_idx[VIONA_MAXQ]; + boolean_t vsc_msix_active; +}; + +/* + * Return the size of IO BAR that maps virtio header and device specific + * region. The size would vary depending on whether MSI-X is enabled or + * not. + */ +static uint64_t +pci_viona_iosize(struct pci_devinst *pi) +{ + if (pci_msix_enabled(pi)) + return (VIONA_REGSZ); + else + return (VIONA_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX)); +} + +static uint16_t +pci_viona_qsize(struct pci_viona_softc *sc, int qnum) +{ + /* XXX no ctl queue currently */ + if (qnum == VIONA_CTLQ) { + return (0); + } + + return (sc->vsc_vq_size); +} + +static void +pci_viona_ring_reset(struct pci_viona_softc *sc, int ring) +{ + assert(ring < VIONA_MAXQ); + + switch (ring) { + case VIONA_RXQ: + case VIONA_TXQ: + break; + case VIONA_CTLQ: + default: + return; + } + + for (;;) { + int res; + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring); + if (res == 0) { + break; + } else if (errno != EINTR) { + WPRINTF(("ioctl viona ring %d reset failed %d\n", + ring, errno)); + return; + } + } + + sc->vsc_pfn[ring] = 0; +} + +static void +pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value) +{ + + if (value == 0) { + DPRINTF(("viona: device reset requested !\n")); + pci_viona_ring_reset(sc, VIONA_RXQ); + pci_viona_ring_reset(sc, VIONA_TXQ); + } + + sc->vsc_status = value; +} + +static void * +pci_viona_poll_thread(void *param) +{ + struct pci_viona_softc *sc = param; + pollfd_t pollset; + const int fd = sc->vsc_vnafd; + + pollset.fd = fd; + pollset.events = POLLRDBAND; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } else { + WPRINTF(("pci_viona_poll_thread poll()" + "error %d\n", errno)); + break; + } + } + if (pollset.revents & POLLRDBAND) { + vioc_intr_poll_t vip; + uint_t i; + int res; + boolean_t assert_lintr = B_FALSE; + const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi); + + res = ioctl(fd, VNA_IOC_INTR_POLL, &vip); + for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) { + if (vip.vip_status[i] == 0) { + continue; + } + if (do_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[i]); + } else { + assert_lintr = B_TRUE; + } + res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i); + if (res != 0) { + WPRINTF(("ioctl viona vq %d intr " + "clear failed %d\n", i, errno)); + } + } + if (assert_lintr) { + pthread_mutex_lock(&sc->vsc_mtx); + sc->vsc_isr |= VTCFG_ISR_QUEUES; + pci_lintr_assert(sc->vsc_pi); + pthread_mutex_unlock(&sc->vsc_mtx); + } + } + } + + pthread_exit(NULL); +} + +static void +pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) +{ + int qnum = sc->vsc_curq; + vioc_ring_init_t vna_ri; + int error; + + assert(qnum < VIONA_MAXQ); + + if (qnum == VIONA_CTLQ) { + return; + } + + sc->vsc_pfn[qnum] = (pfn << VRING_PFN); + + vna_ri.ri_index = qnum; + vna_ri.ri_qsize = pci_viona_qsize(sc, qnum); + vna_ri.ri_qaddr = (pfn << VRING_PFN); + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri); + + if (error != 0) { + WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno)); + } +} + +static int +pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) +{ + vioc_create_t vna_create; + int error; + + sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL); + if (sc->vsc_vnafd == -1) { + WPRINTF(("open viona ctl failed: %d\n", errno)); + return (-1); + } + + vna_create.c_linkid = sc->vsc_linkid; + vna_create.c_vmfd = vm_get_device_fd(ctx); + error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); + if (error != 0) { + (void) close(sc->vsc_vnafd); + WPRINTF(("ioctl viona create failed %d\n", errno)); + return (-1); + } + + return (0); +} + +static int +pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts) +{ + char *next, *cp, *vnic = NULL; + int err = 0; + + sc->vsc_vq_size = VIONA_RINGSZ; + sc->vsc_feature_mask = 0; + + for (; opts != NULL && *opts != '\0'; opts = next) { + char *val; + + if ((cp = strchr(opts, ',')) != NULL) { + *cp = '\0'; + next = cp + 1; + } else { + next = NULL; + } + + if ((cp = strchr(opts, '=')) == NULL) { + /* vnic chosen with bare name */ + if (vnic != NULL) { + fprintf(stderr, + "viona: unexpected vnic name '%s'", opts); + err = -1; + } else { + vnic = opts; + } + continue; + } + + /* <param>=<value> handling */ + val = cp + 1; + *cp = '\0'; + if (strcmp(opts, "feature_mask") == 0) { + long num; + + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0 || num < 0) { + fprintf(stderr, + "viona: invalid mask '%s'", val); + } else { + sc->vsc_feature_mask = num; + } + } else if (strcmp(opts, "vqsize") == 0) { + long num; + + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0) { + fprintf(stderr, + "viona: invalid vsqize '%s'", val); + err = -1; + } else if (num <= 2 || num > 32768) { + fprintf(stderr, + "viona: vqsize out of range", num); + err = -1; + } else if ((1 << (ffs(num) - 1)) != num) { + fprintf(stderr, + "viona: vqsize must be power of 2", num); + err = -1; + } else { + sc->vsc_vq_size = num; + } + } else { + fprintf(stderr, + "viona: unrecognized option '%s'", opts); + err = -1; + } + } + if (vnic == NULL) { + fprintf(stderr, "viona: vnic name required"); + sc->vsc_linkname[0] = '\0'; + err = -1; + } else { + (void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN); + } + + DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc, + sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask)); + return (err); +} + +static int +pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + dladm_handle_t handle; + dladm_status_t status; + dladm_vnic_attr_t attr; + char errmsg[DLADM_STRSIZE]; + int error, i; + struct pci_viona_softc *sc; + uint64_t ioport; + + if (opts == NULL) { + printf("virtio-viona: vnic required\n"); + return (1); + } + + sc = malloc(sizeof (struct pci_viona_softc)); + memset(sc, 0, sizeof (struct pci_viona_softc)); + + pi->pi_arg = sc; + sc->vsc_pi = pi; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + if (pci_viona_parse_opts(sc, opts) != 0) { + free(sc); + return (1); + } + + if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) { + WPRINTF(("could not open /dev/dld")); + free(sc); + return (1); + } + + if (dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid, + NULL, NULL, NULL) != DLADM_STATUS_OK) { + WPRINTF(("dladm_name2info() for %s failed: %s\n", opts, + dladm_status2str(status, errmsg))); + dladm_close(handle); + free(sc); + return (1); + } + + if (dladm_vnic_info(handle, sc->vsc_linkid, &attr, + DLADM_OPT_ACTIVE) != DLADM_STATUS_OK) { + WPRINTF(("dladm_vnic_info() for %s failed: %s\n", opts, + dladm_status2str(status, errmsg))); + dladm_close(handle); + free(sc); + return (1); + } + + memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL); + + dladm_close(handle); + + error = pci_viona_viona_init(ctx, sc); + if (error != 0) { + free(sc); + return (1); + } + + error = pthread_create(NULL, NULL, pci_viona_poll_thread, sc); + assert(error == 0); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + /* MSI-X support */ + for (i = 0; i < VIONA_MAXQ; i++) + sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; + + /* BAR 1 used to map MSI-X table and PBA */ + if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) { + free(sc); + return (1); + } + + /* BAR 0 for legacy-style virtio register access. */ + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + if (error != NULL) { + WPRINTF(("could not allocate virtio BAR\n")); + free(sc); + return (1); + } + + /* Install ioport hook for virtqueue notification */ + ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY; + error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport); + if (error != 0) { + WPRINTF(("could not install ioport hook at %x\n", ioport)); + free(sc); + return (1); + } + + /* + * Need a legacy interrupt for virtio compliance, even though MSI-X + * operation is _strongly_ suggested for adequate performance. + */ + pci_lintr_request(pi); + + return (0); +} + +static uint64_t +viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) +{ + /* + * Device specific offsets used by guest would change based on + * whether MSI-X capability is enabled or not + */ + if (!pci_msix_enabled(pi)) { + if (offset >= VTCFG_R_MSIX) + return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); + } + + return (offset); +} + +static void +pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring) +{ + struct pci_viona_softc *sc = pi->pi_arg; + struct msix_table_entry mte; + uint16_t tab_index; + vioc_ring_msi_t vrm; + int res; + + assert(ring <= VIONA_VQ_TX); + + vrm.rm_index = ring; + vrm.rm_addr = 0; + vrm.rm_msg = 0; + tab_index = sc->vsc_msix_table_idx[ring]; + + if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) { + mte = pi->pi_msix.table[tab_index]; + if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + vrm.rm_addr = mte.addr; + vrm.rm_msg = mte.msg_data; + } + } + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm); + if (res != 0) { + WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno)); + } +} + +static void +pci_viona_lintrupdate(struct pci_devinst *pi) +{ + struct pci_viona_softc *sc = pi->pi_arg; + boolean_t msix_on = B_FALSE; + + pthread_mutex_lock(&sc->vsc_mtx); + msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0); + if ((sc->vsc_msix_active && !msix_on) || + (msix_on && !sc->vsc_msix_active)) { + uint_t i; + + sc->vsc_msix_active = msix_on; + /* Update in-kernel ring configs */ + for (i = 0; i <= VIONA_VQ_TX; i++) { + pci_viona_ring_set_msix(pi, i); + } + } + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset) +{ + struct pci_viona_softc *sc = pi->pi_arg; + uint_t tab_index, i; + + pthread_mutex_lock(&sc->vsc_mtx); + if (!sc->vsc_msix_active) { + pthread_mutex_unlock(&sc->vsc_mtx); + return; + } + + /* + * Rather than update every possible MSI-X vector, cheat and use the + * offset to calculate the entry within the table. Since this should + * only be called when a write to the table succeeds, the index should + * be valid. + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + for (i = 0; i <= VIONA_VQ_TX; i++) { + if (sc->vsc_msix_table_idx[i] != tab_index) { + continue; + } + pci_viona_ring_set_msix(pi, i); + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_qnotify(struct pci_viona_softc *sc, int ring) +{ + int error; + + switch (ring) { + case VIONA_TXQ: + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring); + if (error != 0) { + WPRINTF(("ioctl viona ring %d kick failed %d\n", + ring, errno)); + } + break; + case VIONA_CTLQ: + DPRINTF(("viona: control qnotify!\n")); + break; + default: + break; + } +} + +static void +pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void *ptr; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + if (pci_emul_msix_twrite(pi, offset, size, value) == 0) { + pci_viona_msix_update(pi, offset); + } + return; + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_write: 2big, offset %ld size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + value &= ~(sc->vsc_feature_mask); + err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); + if (err != 0) { + WPRINTF(("ioctl feature negotiation returned" + " err = %d\n", errno)); + } else { + sc->vsc_features = value; + } + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_viona_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VIONA_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VIONA_MAXQ); + pci_viona_qnotify(sc, value); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_viona_update_status(sc, value); + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + sc->vsc_msix_table_idx[VIONA_CTLQ] = value; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + sc->vsc_msix_table_idx[sc->vsc_curq] = value; + pci_viona_ring_set_msix(pi, sc->vsc_curq); + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + /* + * The driver is allowed to change the MAC address + */ + sc->vsc_macaddr[offset - VIONA_R_CFG0] = value; + if (size == 1) { + *(uint8_t *)ptr = value; + } else if (size == 2) { + *(uint16_t *)ptr = value; + } else { + *(uint32_t *)ptr = value; + } + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VIONA_R_CFG6: + case VIONA_R_CFG7: + DPRINTF(("viona: write to readonly reg %ld\n\r", offset)); + break; + default: + DPRINTF(("viona: unknown i/o write offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static uint64_t +pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void *ptr; + uint64_t value; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_read: 2big, offset %ld size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); + if (err != 0) { + WPRINTF(("ioctl get host features returned" + " err = %d\n", errno)); + } + value &= ~sc->vsc_feature_mask; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_viona_qsize(sc, sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + if (value != 0) { + pci_lintr_deassert(pi); + } + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + value = sc->vsc_msix_table_idx[VIONA_CTLQ]; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + value = sc->vsc_msix_table_idx[sc->vsc_curq]; + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + if (size == 1) { + value = *(uint8_t *)ptr; + } else if (size == 2) { + value = *(uint16_t *)ptr; + } else { + value = *(uint32_t *)ptr; + } + break; + case VIONA_R_CFG6: + assert(size != 4); + value = 0x01; /* XXX link always up */ + break; + case VIONA_R_CFG7: + assert(size == 1); + value = 0; /* XXX link status in LSB */ + break; + default: + DPRINTF(("viona: unknown i/o read offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_viona = { + .pe_emu = "virtio-net-viona", + .pe_init = pci_viona_init, + .pe_barwrite = pci_viona_write, + .pe_barread = pci_viona_read, + .pe_lintrupdate = pci_viona_lintrupdate +}; +PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c new file mode 100644 index 0000000000..988e6933cc --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -0,0 +1,2862 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + XHCI options: + -s <n>,xhci,{devices} + + devices: + tablet USB tablet mouse + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <errno.h> +#include <pthread.h> +#include <unistd.h> + +#include <dev/usb/usbdi.h> +#include <dev/usb/usb.h> +#include <dev/usb/usb_freebsd.h> +#include <xhcireg.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "pci_xhci.h" +#include "usb_emul.h" + + +static int xhci_debug = 0; +#define DPRINTF(params) if (xhci_debug) printf params +#define WPRINTF(params) printf params + + +#define XHCI_NAME "xhci" +#define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ + +#define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ + +/* + * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping + * to 4k to avoid going over the guest physical memory barrier. + */ +#define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ + +#define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ + +#define XHCI_CAPLEN (4*8) /* offset of op register space */ +#define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ +#define XHCI_PORTREGS_START 0x400 +#define XHCI_DOORBELL_MAX 256 + +#define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ + +/* caplength and hci-version registers */ +#define XHCI_SET_CAPLEN(x) ((x) & 0xFF) +#define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) +#define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) + +/* hcsparams1 register */ +#define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) +#define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) +#define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) + +/* hcsparams2 register */ +#define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) +#define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) +#define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) +#define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) + +/* hcsparams3 register */ +#define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) +#define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) + +/* hccparams1 register */ +#define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) +#define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) +#define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) +#define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) +#define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) +#define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) +#define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) +#define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) +#define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) +#define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) + +/* hccparams2 register */ +#define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) +#define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) + +/* other registers */ +#define XHCI_SET_DOORBELL(x) ((x) & ~0x03) +#define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) + +/* register masks */ +#define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ +#define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ +#define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ + +/* port register set */ +#define XHCI_PORTREGS_BASE 0x400 /* base offset */ +#define XHCI_PORTREGS_PORT0 0x3F0 +#define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ + +#define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) +#define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) + +#define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & (m)) << (s))) +#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & ((m) << (s))))) + +struct pci_xhci_trb_ring { + uint64_t ringaddr; /* current dequeue guest address */ + uint32_t ccs; /* consumer cycle state */ +}; + +/* device endpoint transfer/stream rings */ +struct pci_xhci_dev_ep { + union { + struct xhci_trb *_epu_tr; + struct xhci_stream_ctx *_epu_sctx; + } _ep_trbsctx; +#define ep_tr _ep_trbsctx._epu_tr +#define ep_sctx _ep_trbsctx._epu_sctx + + union { + struct pci_xhci_trb_ring _epu_trb; + struct pci_xhci_trb_ring *_epu_sctx_trbs; + } _ep_trb_rings; +#define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr +#define ep_ccs _ep_trb_rings._epu_trb.ccs +#define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs + + struct usb_data_xfer *ep_xfer; /* transfer chain */ +}; + +/* device context base address array: maps slot->device context */ +struct xhci_dcbaa { + uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ +}; + +/* port status registers */ +struct pci_xhci_portregs { + uint32_t portsc; /* port status and control */ + uint32_t portpmsc; /* port pwr mgmt status & control */ + uint32_t portli; /* port link info */ + uint32_t porthlpmc; /* port hardware LPM control */ +} __packed; +#define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) + +/* xHC operational registers */ +struct pci_xhci_opregs { + uint32_t usbcmd; /* usb command */ + uint32_t usbsts; /* usb status */ + uint32_t pgsz; /* page size */ + uint32_t dnctrl; /* device notification control */ + uint64_t crcr; /* command ring control */ + uint64_t dcbaap; /* device ctx base addr array ptr */ + uint32_t config; /* configure */ + + /* guest mapped addresses: */ + struct xhci_trb *cr_p; /* crcr dequeue */ + struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ +}; + +/* xHC runtime registers */ +struct pci_xhci_rtsregs { + uint32_t mfindex; /* microframe index */ + struct { /* interrupter register set */ + uint32_t iman; /* interrupter management */ + uint32_t imod; /* interrupter moderation */ + uint32_t erstsz; /* event ring segment table size */ + uint32_t rsvd; + uint64_t erstba; /* event ring seg-tbl base addr */ + uint64_t erdp; /* event ring dequeue ptr */ + } intrreg __packed; + + /* guest mapped addresses */ + struct xhci_event_ring_seg *erstba_p; + struct xhci_trb *erst_p; /* event ring segment tbl */ + int er_deq_seg; /* event ring dequeue segment */ + int er_enq_idx; /* event ring enqueue index - xHCI */ + int er_enq_seg; /* event ring enqueue segment */ + uint32_t er_events_cnt; /* number of events in ER */ + uint32_t event_pcs; /* producer cycle state flag */ +}; + + +struct pci_xhci_softc; + + +/* + * USB device emulation container. + * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each + * emulated device instance. + */ +struct pci_xhci_dev_emu { + struct pci_xhci_softc *xsc; + + /* XHCI contexts */ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; + int dev_slotstate; + + struct usb_devemu *dev_ue; /* USB emulated dev */ + void *dev_sc; /* device's softc */ + + struct usb_hci hci; +}; + +struct pci_xhci_softc { + struct pci_devinst *xsc_pi; + + pthread_mutex_t mtx; + + uint32_t caplength; /* caplen & hciversion */ + uint32_t hcsparams1; /* structural parameters 1 */ + uint32_t hcsparams2; /* structural parameters 2 */ + uint32_t hcsparams3; /* structural parameters 3 */ + uint32_t hccparams1; /* capability parameters 1 */ + uint32_t dboff; /* doorbell offset */ + uint32_t rtsoff; /* runtime register space offset */ + uint32_t hccparams2; /* capability parameters 2 */ + + uint32_t regsend; /* end of configuration registers */ + + struct pci_xhci_opregs opregs; + struct pci_xhci_rtsregs rtsregs; + + struct pci_xhci_portregs *portregs; + struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ + struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ + int ndevices; + + int usb2_port_start; + int usb3_port_start; +}; + + +/* portregs and devices arrays are set up to start from idx=1 */ +#define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] +#define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] +#define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] + +#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) + +#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ + (a), \ + XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + +static int xhci_in_use; + +/* map USB errors to XHCI */ +static const int xhci_usb_errors[USB_ERR_MAX] = { + [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, + [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, + [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, + [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, + [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, + [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, + [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, + [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, +}; +#define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? xhci_usb_errors[(e)] : \ + XHCI_TRB_ERROR_INVALID) + +static int pci_xhci_insert_event(struct pci_xhci_softc *sc, + struct xhci_trb *evtrb, int do_intr); +static void pci_xhci_dump_trb(struct xhci_trb *trb); +static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); +static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); +static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); +static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t streamid, + uint64_t ringaddr, int ccs); + +static void +pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, + uint32_t evtype) +{ + evtrb->qwTrb0 = port << 24; + evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); + evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); +} + + +/* controller reset */ +static void +pci_xhci_reset(struct pci_xhci_softc *sc) +{ + int i; + + sc->rtsregs.er_enq_idx = 0; + sc->rtsregs.er_events_cnt = 0; + sc->rtsregs.event_pcs = 1; + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + pci_xhci_reset_slot(sc, i); + } +} + +static uint32_t +pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) +{ + int do_intr = 0; + int i; + + if (cmd & XHCI_CMD_RS) { + do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; + + sc->opregs.usbcmd |= XHCI_CMD_RS; + sc->opregs.usbsts &= ~XHCI_STS_HCH; + sc->opregs.usbsts |= XHCI_STS_PCD; + + /* Queue port change event on controller run from stop */ + if (do_intr) + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + struct pci_xhci_dev_emu *dev; + struct pci_xhci_portregs *port; + struct xhci_trb evtrb; + + if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) + continue; + + port = XHCI_PORTREG_PTR(sc, i); + port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; + port->portsc &= ~XHCI_PS_PLS_MASK; + + /* + * XHCI 4.19.3 USB2 RxDetect->Polling, + * USB3 Polling->U0 + */ + if (dev->dev_ue->ue_usbver == 2) + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); + else + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_U0); + + pci_xhci_set_evtrb(&evtrb, i, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + if (pci_xhci_insert_event(sc, &evtrb, 0) != + XHCI_TRB_ERROR_SUCCESS) + break; + } + } else { + sc->opregs.usbcmd &= ~XHCI_CMD_RS; + sc->opregs.usbsts |= XHCI_STS_HCH; + sc->opregs.usbsts &= ~XHCI_STS_PCD; + } + + /* start execution of schedule; stop when set to 0 */ + cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; + + if (cmd & XHCI_CMD_HCRST) { + /* reset controller */ + pci_xhci_reset(sc); + cmd &= ~XHCI_CMD_HCRST; + } + + cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (cmd); +} + +static void +pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct xhci_trb evtrb; + struct pci_xhci_portregs *p; + int port; + uint32_t oldpls, newpls; + + if (sc->portregs == NULL) + return; + + port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; + offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; + + DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx\r\n", + offset, port, value)); + + assert(port >= 0); + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_write port %d > ndevices\r\n", + port)); + return; + } + + if (XHCI_DEVINST_PTR(sc, port) == NULL) { + DPRINTF(("pci_xhci: portregs_write to unattached port %d\r\n", + port)); + } + + p = XHCI_PORTREG_PTR(sc, port); + switch (offset) { + case 0: + /* port reset or warm reset */ + if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { + pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); + break; + } + + if ((p->portsc & XHCI_PS_PP) == 0) { + WPRINTF(("pci_xhci: portregs_write to unpowered " + "port %d\r\n", port)); + break; + } + + /* Port status and control register */ + oldpls = XHCI_PS_PLS_GET(p->portsc); + newpls = XHCI_PS_PLS_GET(value); + + p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | + XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; + + if (XHCI_DEVINST_PTR(sc, port)) + p->portsc |= XHCI_PS_CCS; + + p->portsc |= (value & + ~(XHCI_PS_OCA | + XHCI_PS_PR | + XHCI_PS_PED | + XHCI_PS_PLS_MASK | /* link state */ + XHCI_PS_SPEED_MASK | + XHCI_PS_PIC_MASK | /* port indicator */ + XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); + + /* clear control bits */ + p->portsc &= ~(value & + (XHCI_PS_CSC | + XHCI_PS_PEC | + XHCI_PS_WRC | + XHCI_PS_OCC | + XHCI_PS_PRC | + XHCI_PS_PLC | + XHCI_PS_CEC | + XHCI_PS_CAS)); + + /* port disable request; for USB3, don't care */ + if (value & XHCI_PS_PED) + DPRINTF(("Disable port %d request\r\n", port)); + + if (!(value & XHCI_PS_LWS)) + break; + + DPRINTF(("Port new PLS: %d\r\n", newpls)); + switch (newpls) { + case 0: /* U0 */ + case 3: /* U3 */ + if (oldpls != newpls) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(newpls) | + XHCI_PS_PLC; + + if (oldpls != 0 && newpls == 0) { + pci_xhci_set_evtrb(&evtrb, port, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + pci_xhci_insert_event(sc, &evtrb, 1); + } + } + break; + + default: + DPRINTF(("Unhandled change port %d PLS %u\r\n", + port, newpls)); + break; + } + break; + case 4: + /* Port power management status and control register */ + p->portpmsc = value; + break; + case 8: + /* Port link information register */ + DPRINTF(("pci_xhci attempted write to PORTLI, port %d\r\n", + port)); + break; + case 12: + /* + * Port hardware LPM control register. + * For USB3, this register is reserved. + */ + p->porthlpmc = value; + break; + } +} + +struct xhci_dev_ctx * +pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) +{ + uint64_t devctx_addr; + struct xhci_dev_ctx *devctx; + + assert(slot > 0 && slot <= sc->ndevices); + assert(sc->opregs.dcbaa_p != NULL); + + devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; + + if (devctx_addr == 0) { + DPRINTF(("get_dev_ctx devctx_addr == 0\r\n")); + return (NULL); + } + + DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx\r\n", + slot, devctx_addr)); + devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); + + return (devctx); +} + +struct xhci_trb * +pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, + uint64_t *guestaddr) +{ + struct xhci_trb *next; + + assert(curtrb != NULL); + + if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { + if (guestaddr) + *guestaddr = curtrb->qwTrb0 & ~0xFUL; + + next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); + } else { + if (guestaddr) + *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; + + next = curtrb + 1; + } + + return (next); +} + +static void +pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) +{ + + sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; + sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; + sc->opregs.usbsts |= XHCI_STS_EINT; + + /* only trigger interrupt if permitted */ + if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && + (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { + if (pci_msi_enabled(sc->xsc_pi)) + pci_generate_msi(sc->xsc_pi, 0); + else + pci_lintr_assert(sc->xsc_pi); + } +} + +static void +pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) +{ + + if (!pci_msi_enabled(sc->xsc_pi)) + pci_lintr_assert(sc->xsc_pi); +} + +static void +pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + uint32_t pstreams; + int i; + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); + if (pstreams > 0) { + DPRINTF(("init_ep %d with pstreams %d\r\n", epid, pstreams)); + assert(devep->ep_sctx_trbs == NULL); + + devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK); + devep->ep_sctx_trbs = calloc(pstreams, + sizeof(struct pci_xhci_trb_ring)); + for (i = 0; i < pstreams; i++) { + devep->ep_sctx_trbs[i].ringaddr = + devep->ep_sctx[i].qwSctx0 & + XHCI_SCTX_0_TR_DQ_PTR_MASK; + devep->ep_sctx_trbs[i].ccs = + XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); + } + } else { + DPRINTF(("init_ep %d with no pstreams\r\n", epid)); + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK; + devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); + devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); + DPRINTF(("init_ep tr DCS %x\r\n", devep->ep_ccs)); + } + + if (devep->ep_xfer == NULL) { + devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); + USB_DATA_XFER_INIT(devep->ep_xfer); + } +} + +static void +pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + + DPRINTF(("pci_xhci disable_ep %d\r\n", epid)); + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; + + devep = &dev->eps[epid]; + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && + devep->ep_sctx_trbs != NULL) + free(devep->ep_sctx_trbs); + + if (devep->ep_xfer != NULL) { + free(devep->ep_xfer); + devep->ep_xfer = NULL; + } + + memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); +} + + +/* reset device at slot and data structures related to it */ +static void +pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) +{ + struct pci_xhci_dev_emu *dev; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + + if (!dev) { + DPRINTF(("xhci reset unassigned slot (%d)?\r\n", slot)); + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + } + + /* TODO: reset ring buffer pointers */ +} + +static int +pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, + int do_intr) +{ + struct pci_xhci_rtsregs *rts; + uint64_t erdp; + int erdp_idx; + int err; + struct xhci_trb *evtrbptr; + + err = XHCI_TRB_ERROR_SUCCESS; + + rts = &sc->rtsregs; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]\r\n" + "\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u\r\n" + "\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)\r\n", + evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3, + erdp_idx, rts->er_deq_seg, rts->er_enq_idx, + rts->er_enq_seg, + rts->event_pcs, erdp, rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize, do_intr)); + + evtrbptr = &rts->erst_p[rts->er_enq_idx]; + + /* TODO: multi-segment table */ + if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { + DPRINTF(("pci_xhci[%d] cannot insert event; ring full\r\n", + __LINE__)); + err = XHCI_TRB_ERROR_EV_RING_FULL; + goto done; + } + + if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { + struct xhci_trb errev; + + if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { + + DPRINTF(("pci_xhci[%d] insert evt err: ring full\r\n", + __LINE__)); + + errev.qwTrb0 = 0; + errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( + XHCI_TRB_ERROR_EV_RING_FULL); + errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( + XHCI_TRB_EVENT_HOST_CTRL) | + rts->event_pcs; + rts->er_events_cnt++; + memcpy(&rts->erst_p[rts->er_enq_idx], &errev, + sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + err = XHCI_TRB_ERROR_EV_RING_FULL; + do_intr = 1; + + goto done; + } + } else { + rts->er_events_cnt++; + } + + evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; + evtrb->dwTrb3 |= rts->event_pcs; + + memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + + if (rts->er_enq_idx == 0) + rts->event_pcs ^= 1; + +done: + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (err); +} + +static uint32_t +pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs != NULL) + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + dev = XHCI_SLOTDEV_PTR(sc, i); + if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { + *slot = i; + dev->dev_slotstate = XHCI_ST_ENABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + dev->hci.hci_address = i; + break; + } + } + + DPRINTF(("pci_xhci enable slot (error=%d) slot %u\r\n", + cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); + + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + + DPRINTF(("pci_xhci disable slot %u\r\n", slot)); + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + if (slot > sc->ndevices) { + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (dev) { + if (dev->dev_slotstate == XHCI_ST_DISABLED) { + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + /* TODO: reset events and endpoints */ + } + } + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + DPRINTF(("pci_xhci reset device slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + else { + dev->dev_slotstate = XHCI_ST_DEFAULT; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, + 0x1F, 27); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* reset all eps other than ep-0 */ + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_DISABLED, 0x7, 0); + } + + cmderr = XHCI_TRB_ERROR_SUCCESS; + } + + pci_xhci_reset_slot(sc, slot); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { + DPRINTF(("pci_xhci: address device, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: address device, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + dev->hci.hci_address = slot; + dev->dev_ctx = dev_ctx; + + if (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + + memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); + + dev_ctx->ctx_slot.dwSctx3 = + XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | + XHCI_SCTX_3_DEV_ADDR_SET(slot); + + memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); + ep0_ctx = &dev_ctx->ctx_ep[1]; + ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | + XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); + + pci_xhci_init_ep(dev, 1); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + DPRINTF(("pci_xhci: address device, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx, *iep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci config_ep slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { + DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u\r\n", + slot)); + if (dev->dev_ue->ue_stop != NULL) + dev->dev_ue->ue_stop(dev->dev_sc); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, + 0x1F, 27); + + /* disable endpoints */ + for (i = 2; i < 32; i++) + pci_xhci_disable_ep(dev, i); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + goto done; + } + + if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { + DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed\r\n", + dev->dev_slotstate)); + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + /* In addressed/configured state; + * for each drop endpoint ctx flag: + * ep->state = DISABLED + * for each add endpoint ctx flag: + * cp(ep-in, ep-out) + * ep->state = RUNNING + * for each drop+add endpoint flag: + * reset ep resources + * cp(ep-in, ep-out) + * ep->state = RUNNING + * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) + * slot->state = configured + */ + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + dev_ctx = dev->dev_ctx; + DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + input_ctx->ctx_input.dwInCtx7)); + + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + + if (input_ctx->ctx_input.dwInCtx0 & + XHCI_INCTX_0_DROP_MASK(i)) { + DPRINTF((" config ep - dropping ep %d\r\n", i)); + pci_xhci_disable_ep(dev, i); + } + + if (input_ctx->ctx_input.dwInCtx1 & + XHCI_INCTX_1_ADD_MASK(i)) { + iep_ctx = &input_ctx->ctx_ep[i]; + + DPRINTF((" enable ep[%d] %08x %08x %016lx %08x\r\n", + i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, + iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); + + memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); + + pci_xhci_init_ep(dev, i); + + /* ep state */ + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + } + } + + /* slot state to configured */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); + dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); + dev->dev_slotstate = XHCI_ST_CONFIGURED; + + DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " + "[3]=0x%08x\r\n", + slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t type; + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + + DPRINTF(("pci_xhci: reset ep %u: slot %u\r\n", epid, slot)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if (type == XHCI_TRB_TYPE_STOP_EP && + (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { + /* XXX suspend endpoint for 10ms */ + } + + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: reset ep: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + devep = &dev->eps[epid]; + if (devep->ep_xfer != NULL) + USB_DATA_XFER_RESET(devep->ep_xfer); + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) + ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; + + DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, + ep_ctx->dwEpCtx4)); + + if (type == XHCI_TRB_TYPE_RESET_EP && + (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + +done: + return (cmderr); +} + + +static uint32_t +pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, + uint32_t streamid, struct xhci_stream_ctx **osctx) +{ + struct xhci_stream_ctx *sctx; + uint32_t maxpstreams; + + maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); + if (maxpstreams == 0) + return (XHCI_TRB_ERROR_TRB); + + if (maxpstreams > XHCI_STREAMS_MAX) + return (XHCI_TRB_ERROR_INVALID_SID); + + if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { + DPRINTF(("pci_xhci: find_stream; LSA bit not set\r\n")); + return (XHCI_TRB_ERROR_INVALID_SID); + } + + /* only support primary stream */ + if (streamid > maxpstreams) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; + if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + *osctx = sctx; + + return (XHCI_TRB_ERROR_SUCCESS); +} + + +static uint32_t +pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t streamid; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u\r\n" + " stream-id %u, slot %u, epid %u, C %u\r\n", + (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), + (uint32_t)(trb->qwTrb0 & 0x1), (trb->dwTrb2 >> 16) & 0xFFFF, + XHCI_TRB_3_SLOT_GET(trb->dwTrb3), + XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + + switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { + case XHCI_ST_EPCTX_STOPPED: + case XHCI_ST_EPCTX_ERROR: + break; + default: + DPRINTF(("pci_xhci cmd set_tr invalid state %x\r\n", + XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); + cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; + goto done; + } + + streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { + struct xhci_stream_ctx *sctx; + + sctx = NULL; + cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); + if (sctx != NULL) { + assert(devep->ep_sctx != NULL); + + devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; + devep->ep_sctx_trbs[streamid].ringaddr = + trb->qwTrb0 & ~0xF; + devep->ep_sctx_trbs[streamid].ccs = + XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); + } + } else { + if (streamid != 0) { + DPRINTF(("pci_xhci cmd set_tr streamid %x != 0\r\n", + streamid)); + } + ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; + devep->ep_ccs = trb->qwTrb0 & 0x1; + devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); + + DPRINTF(("pci_xhci set_tr first TRB:\r\n")); + pci_xhci_dump_trb(devep->ep_tr); + } + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { + DPRINTF(("pci_xhci: eval ctx, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot; in this emulation, slot_id = address */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: eval ctx, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ + /* set max exit latency */ + dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, + 0xFFFF, 0); + + /* set interrupter target */ + dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, + 0x3FF, 22); + } + if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ + /* set max packet size */ + dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( + dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, + 0xFFFF, 16); + + ep0_ctx = &dev_ctx->ctx_ep[1]; + } + + DPRINTF(("pci_xhci: eval ctx, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return (cmderr); +} + +static int +pci_xhci_complete_commands(struct pci_xhci_softc *sc) +{ + struct xhci_trb evtrb; + struct xhci_trb *trb; + uint64_t crcr; + uint32_t ccs; /* cycle state (XHCI 4.9.2) */ + uint32_t type; + uint32_t slot; + uint32_t cmderr; + int error; + + error = 0; + sc->opregs.crcr |= XHCI_CRCR_LO_CRR; + + trb = sc->opregs.cr_p; + ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; + crcr = sc->opregs.crcr & ~0xF; + + while (1) { + sc->opregs.cr_p = trb; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) + break; + + DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" + " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u\r\n", + type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + evtrb.dwTrb2 = 0; + evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | + XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); + slot = 0; + + switch (type) { + case XHCI_TRB_TYPE_LINK: /* 0x06 */ + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= XHCI_CRCR_LO_RCS; + break; + + case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ + cmderr = pci_xhci_cmd_enable_slot(sc, &slot); + break; + + case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_disable_slot(sc, slot); + break; + + case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_address_device(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ + DPRINTF(("Reset Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ + DPRINTF(("Stop Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_device(sc, slot); + break; + + case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ + /* TODO: */ + break; + + case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ + break; + + case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ + break; + + case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ + break; + + case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ + break; + + case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ + break; + + default: + DPRINTF(("pci_xhci: unsupported cmd %x\r\n", type)); + break; + } + + if (type != XHCI_TRB_TYPE_LINK) { + /* + * insert command completion event and assert intr + */ + evtrb.qwTrb0 = crcr; + evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); + evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); + DPRINTF(("pci_xhci: command 0x%x result: 0x%x\r\n", + type, cmderr)); + pci_xhci_insert_event(sc, &evtrb, 1); + } + + trb = pci_xhci_trb_next(sc, trb, &crcr); + } + + sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; + sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; + return (error); +} + +static void +pci_xhci_dump_trb(struct xhci_trb *trb) +{ + static const char *trbtypes[] = { + "RESERVED", + "NORMAL", + "SETUP_STAGE", + "DATA_STAGE", + "STATUS_STAGE", + "ISOCH", + "LINK", + "EVENT_DATA", + "NOOP", + "ENABLE_SLOT", + "DISABLE_SLOT", + "ADDRESS_DEVICE", + "CONFIGURE_EP", + "EVALUATE_CTX", + "RESET_EP", + "STOP_EP", + "SET_TR_DEQUEUE", + "RESET_DEVICE", + "FORCE_EVENT", + "NEGOTIATE_BW", + "SET_LATENCY_TOL", + "GET_PORT_BW", + "FORCE_HEADER", + "NOOP_CMD" + }; + uint32_t type; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x\r\n", + trb, type, + type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", + trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); +} + +static int +pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, + uint32_t slot, uint32_t epid, int *do_intr) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct xhci_trb *trb; + struct xhci_trb evtrb; + uint32_t trbflags; + uint32_t edtla; + int i, err; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + err = XHCI_TRB_ERROR_SUCCESS; + *do_intr = 0; + edtla = 0; + + /* go through list of TRBs and insert event(s) */ + for (i = xfer->head; xfer->ndata > 0; ) { + evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; + trb = XHCI_GADDR(sc, evtrb.qwTrb0); + trbflags = trb->dwTrb3; + + DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " + "(err %d) IOC?%d\r\n", + i, xfer->data[i].processed, xfer->data[i].blen, + XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, + trbflags, err, + trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); + + if (!xfer->data[i].processed) { + xfer->head = i; + break; + } + + xfer->ndata--; + edtla += xfer->data[i].bdone; + + trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); + + pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, + xfer->data[i].streamid, xfer->data[i].trbnext, + xfer->data[i].ccs); + + /* Only interrupt if IOC or short packet */ + if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && + !((err == XHCI_TRB_ERROR_SHORT_PKT) && + (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + continue; + } + + evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | + XHCI_TRB_2_REM_SET(xfer->data[i].blen); + + evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | + XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); + + if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { + DPRINTF(("pci_xhci EVENT_DATA edtla %u\r\n", edtla)); + evtrb.qwTrb0 = trb->qwTrb0; + evtrb.dwTrb2 = (edtla & 0xFFFFF) | + XHCI_TRB_2_ERROR_SET(err); + evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; + edtla = 0; + } + + *do_intr = 1; + + err = pci_xhci_insert_event(sc, &evtrb, 0); + if (err != XHCI_TRB_ERROR_SUCCESS) { + break; + } + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + } + + return (err); +} + +static void +pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, + uint32_t streamid, uint64_t ringaddr, int ccs) +{ + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | + (ccs & 0x1); + + devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; + devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; + ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring stream %d, addr %lx\r\n", + streamid, devep->ep_sctx[streamid].qwSctx0)); + } else { + devep->ep_ringaddr = ringaddr & ~0xFUL; + devep->ep_ccs = ccs & 0x1; + devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); + ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring, addr %lx\r\n", + (devep->ep_ringaddr | devep->ep_ccs))); + } +} + +/* + * Outstanding transfer still in progress (device NAK'd earlier) so retry + * the transfer again to see if it succeeds. + */ +static int +pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) +{ + struct usb_data_xfer *xfer; + int err; + int do_intr; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + err = 0; + do_intr = 0; + + xfer = devep->ep_xfer; +#ifdef __FreeBSD__ + USB_DATA_XFER_LOCK(xfer); +#else + /* + * At least one caller needs to hold this lock across the call to this + * function and other code. To avoid deadlock from a recursive mutex + * enter, we ensure that all callers hold this lock. + */ + assert(USB_DATA_XFER_LOCK_HELD(xfer)); +#endif + + /* outstanding requests queued up */ + if (dev->dev_ue->ue_data != NULL) { + err = dev->dev_ue->ue_data(dev->dev_sc, xfer, + epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); + if (err == USB_ERR_CANCELLED) { + if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == + USB_NAK) + err = XHCI_TRB_ERROR_SUCCESS; + } else { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, + &do_intr); + if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { + pci_xhci_assert_interrupt(sc); + } + + + /* XXX should not do it if error? */ + USB_DATA_XFER_RESET(xfer); + } + } + +#ifdef __FreeBSD__ + USB_DATA_XFER_UNLOCK(xfer); +#endif + + return (err); +} + + +static int +pci_xhci_handle_transfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, + uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) +{ + struct xhci_trb *setup_trb; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + uint64_t val; + uint32_t trbflags; + int do_intr, err; + int do_retry; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + xfer = devep->ep_xfer; + USB_DATA_XFER_LOCK(xfer); + + DPRINTF(("pci_xhci handle_transfer slot %u\r\n", slot)); + +retry: + err = 0; + do_retry = 0; + do_intr = 0; + setup_trb = NULL; + + while (1) { + pci_xhci_dump_trb(trb); + + trbflags = trb->dwTrb3; + + if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && + (trbflags & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) { + DPRINTF(("Cycle-bit changed trbflags %x, ccs %x\r\n", + trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); + break; + } + + xfer_block = NULL; + + switch (XHCI_TRB_3_TYPE_GET(trbflags)) { + case XHCI_TRB_TYPE_LINK: + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= 0x1; + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_SETUP_STAGE: + if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || + XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { + DPRINTF(("pci_xhci: invalid setup trb\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + setup_trb = trb; + + val = trb->qwTrb0; + if (!xfer->ureq) + xfer->ureq = malloc( + sizeof(struct usb_device_request)); + memcpy(xfer->ureq, &val, + sizeof(struct usb_device_request)); + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_NORMAL: + case XHCI_TRB_TYPE_ISOCH: + if (setup_trb != NULL) { + DPRINTF(("pci_xhci: trb not supposed to be in " + "ctl scope\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + /* fall through */ + + case XHCI_TRB_TYPE_DATA_STAGE: + xfer_block = usb_data_xfer_append(xfer, + (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? + &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), + trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_STATUS_STAGE: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_NOOP: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_EVENT_DATA: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { + xfer_block->processed = 1; + } + break; + + default: + DPRINTF(("pci_xhci: handle xfer unexpected trb type " + "0x%x\r\n", + XHCI_TRB_3_TYPE_GET(trbflags))); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + + trb = pci_xhci_trb_next(sc, trb, &addr); + + DPRINTF(("pci_xhci: next trb: 0x%lx\r\n", (uint64_t)trb)); + + if (xfer_block) { + xfer_block->trbnext = addr; + xfer_block->streamid = streamid; + } + + if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && + XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { + break; + } + + /* handle current batch that requires interrupt on complete */ + if (trbflags & XHCI_TRB_3_IOC_BIT) { + DPRINTF(("pci_xhci: trb IOC bit set\r\n")); + if (epid == 1) + do_retry = 1; + break; + } + } + + DPRINTF(("pci_xhci[%d]: xfer->ndata %u\r\n", __LINE__, xfer->ndata)); + + if (epid == 1) { + err = USB_ERR_NOT_STARTED; + if (dev->dev_ue->ue_request != NULL) + err = dev->dev_ue->ue_request(dev->dev_sc, xfer); + setup_trb = NULL; + } else { + /* handle data transfer */ + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); + err = XHCI_TRB_ERROR_SUCCESS; + goto errout; + } + + err = USB_TO_XHCI_ERR(err); + if ((err == XHCI_TRB_ERROR_SUCCESS) || + (err == XHCI_TRB_ERROR_SHORT_PKT)) { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); + if (err != XHCI_TRB_ERROR_SUCCESS) + do_retry = 0; + } + +errout: + if (err == XHCI_TRB_ERROR_EV_RING_FULL) + DPRINTF(("pci_xhci[%d]: event ring full\r\n", __LINE__)); + + if (!do_retry) + USB_DATA_XFER_UNLOCK(xfer); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + if (do_retry) { + USB_DATA_XFER_RESET(xfer); + DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs\r\n", + __LINE__)); + goto retry; + } + + if (epid == 1) + USB_DATA_XFER_RESET(xfer); + + return (err); +} + +static void +pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, + uint32_t epid, uint32_t streamid) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct pci_xhci_trb_ring *sctx_tr; + struct xhci_trb *trb; + uint64_t ringaddr; + uint32_t ccs; + + DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u\r\n", + slot, epid, streamid)); + + if (slot == 0 || slot > sc->ndevices) { + DPRINTF(("pci_xhci: invalid doorbell slot %u\r\n", slot)); + return; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + if (!dev_ctx) { + return; + } + ep_ctx = &dev_ctx->ctx_ep[epid]; + + sctx_tr = NULL; + + DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, + ep_ctx->dwEpCtx4)); + + if (ep_ctx->qwEpCtx2 == 0) + return; + + /* handle pending transfers */ + if (devep->ep_xfer->ndata > 0) { +#ifndef __FreeBSD__ + USB_DATA_XFER_LOCK(devep->ep_xfer); +#endif + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); +#ifndef __FreeBSD__ + USB_DATA_XFER_UNLOCK(devep->ep_xfer); +#endif + return; + } + + /* get next trb work item */ + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + sctx_tr = &devep->ep_sctx_trbs[streamid]; + ringaddr = sctx_tr->ringaddr; + ccs = sctx_tr->ccs; + trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); + DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x\r\n", + streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } else { + ringaddr = devep->ep_ringaddr; + ccs = devep->ep_ccs; + trb = devep->ep_tr; + DPRINTF(("doorbell, ccs %lx, trb ccs %x\r\n", + ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } + + if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { + DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?\r\n", + ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); + return; + } + + pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, + ringaddr, ccs, streamid); +} + +static void +pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + + offset = (offset - sc->dboff) / sizeof(uint32_t); + + DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + if (XHCI_HALTED(sc)) { + DPRINTF(("pci_xhci: controller halted\r\n")); + return; + } + + if (offset == 0) + pci_xhci_complete_commands(sc); + else if (sc->portregs != NULL) + pci_xhci_device_doorbell(sc, offset, + XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); +} + +static void +pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct pci_xhci_rtsregs *rts; + + offset -= sc->rtsoff; + + if (offset == 0) { + DPRINTF(("pci_xhci attempted write to MFINDEX\r\n")); + return; + } + + DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + offset -= 0x20; /* start of intrreg */ + + rts = &sc->rtsregs; + + switch (offset) { + case 0x00: + if (value & XHCI_IMAN_INTR_PEND) + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | + (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); + + if (!(value & XHCI_IMAN_INTR_ENA)) + pci_xhci_deassert_interrupt(sc); + + break; + + case 0x04: + rts->intrreg.imod = value; + break; + + case 0x08: + rts->intrreg.erstsz = value & 0xFFFF; + break; + + case 0x10: + /* ERSTBA low bits */ + rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | + (value & ~0x3F); + break; + + case 0x14: + /* ERSTBA high bits */ + rts->intrreg.erstba = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erstba); + + rts->erstba_p = XHCI_GADDR(sc, + sc->rtsregs.intrreg.erstba & ~0x3FUL); + + rts->erst_p = XHCI_GADDR(sc, + sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); + + rts->er_enq_idx = 0; + rts->er_events_cnt = 0; + + DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u\r\n", + rts->erstba_p, + rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize)); + break; + + case 0x18: + /* ERDP low bits */ + rts->intrreg.erdp = + MASK_64_HI(sc->rtsregs.intrreg.erdp) | + (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | + (value & ~0xF); + if (value & XHCI_ERDP_LO_BUSY) { + rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + } + + rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); + + break; + + case 0x1C: + /* ERDP high bits */ + rts->intrreg.erdp = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erdp); + + if (rts->er_events_cnt > 0) { + uint64_t erdp; + uint32_t erdp_i; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + if (erdp_i <= rts->er_enq_idx) + rts->er_events_cnt = rts->er_enq_idx - erdp_i; + else + rts->er_events_cnt = + rts->erstba_p->dwEvrsTableSize - + (erdp_i - rts->er_enq_idx); + + DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u\r\n", + erdp, rts->er_events_cnt)); + } + + break; + + default: + DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx\r\n", + offset)); + break; + } +} + +static uint64_t +pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + int port; + uint32_t *p; + + if (sc->portregs == NULL) + return (0); + + port = (offset - 0x3F0) / 0x10; + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS\r\n", + port)); + + /* return default value for unused port */ + return (XHCI_PS_SPEED_SET(3)); + } + + offset = (offset - 0x3F0) % 0x10; + + p = &sc->portregs[port].portsc; + p += offset / sizeof(uint32_t); + + DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x\r\n", + offset, port, *p)); + + return (*p); +} + +static void +pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + offset -= XHCI_CAPLEN; + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + switch (offset) { + case XHCI_USBCMD: + sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); + break; + + case XHCI_USBSTS: + /* clear bits on write */ + sc->opregs.usbsts &= ~(value & + (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| + XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); + break; + + case XHCI_PAGESIZE: + /* read only */ + break; + + case XHCI_DNCTRL: + sc->opregs.dnctrl = value & 0xFFFF; + break; + + case XHCI_CRCR_LO: + if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { + sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + sc->opregs.crcr |= value & + (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + } else { + sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | + (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); + } + break; + + case XHCI_CRCR_HI: + if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { + sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | + (value << 32); + + sc->opregs.cr_p = XHCI_GADDR(sc, + sc->opregs.crcr & ~0xF); + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { + /* Stop operation of Command Ring */ + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { + /* Abort command */ + } + + break; + + case XHCI_DCBAAP_LO: + sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | + (value & 0xFFFFFFC0); + break; + + case XHCI_DCBAAP_HI: + sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | + (value << 32); + sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); + + DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)\r\n", + sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); + break; + + case XHCI_CONFIG: + sc->opregs.config = value & 0x03FF; + break; + + default: + if (offset >= 0x400) + pci_xhci_portregs_write(sc, offset, value); + + break; + } +} + + +static void +pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_xhci_softc *sc; + + sc = pi->pi_arg; + + assert(baridx == 0); + + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) /* read only registers */ + WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); + else if (offset < sc->dboff) + pci_xhci_hostop_write(sc, offset, value); + else if (offset < sc->rtsoff) + pci_xhci_dbregs_write(sc, offset, value); + else if (offset < sc->regsend) + pci_xhci_rtsregs_write(sc, offset, value); + else + WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + switch (offset) { + case XHCI_CAPLENGTH: /* 0x00 */ + value = sc->caplength; + break; + + case XHCI_HCSPARAMS1: /* 0x04 */ + value = sc->hcsparams1; + break; + + case XHCI_HCSPARAMS2: /* 0x08 */ + value = sc->hcsparams2; + break; + + case XHCI_HCSPARAMS3: /* 0x0C */ + value = sc->hcsparams3; + break; + + case XHCI_HCSPARAMS0: /* 0x10 */ + value = sc->hccparams1; + break; + + case XHCI_DBOFF: /* 0x14 */ + value = sc->dboff; + break; + + case XHCI_RTSOFF: /* 0x18 */ + value = sc->rtsoff; + break; + + case XHCI_HCCPRAMS2: /* 0x1C */ + value = sc->hccparams2; + break; + + default: + value = 0; + break; + } + + DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + offset = (offset - XHCI_CAPLEN); + + switch (offset) { + case XHCI_USBCMD: /* 0x00 */ + value = sc->opregs.usbcmd; + break; + + case XHCI_USBSTS: /* 0x04 */ + value = sc->opregs.usbsts; + break; + + case XHCI_PAGESIZE: /* 0x08 */ + value = sc->opregs.pgsz; + break; + + case XHCI_DNCTRL: /* 0x14 */ + value = sc->opregs.dnctrl; + break; + + case XHCI_CRCR_LO: /* 0x18 */ + value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; + break; + + case XHCI_CRCR_HI: /* 0x1C */ + value = 0; + break; + + case XHCI_DCBAAP_LO: /* 0x30 */ + value = sc->opregs.dcbaap & 0xFFFFFFFF; + break; + + case XHCI_DCBAAP_HI: /* 0x34 */ + value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; + break; + + case XHCI_CONFIG: /* 0x38 */ + value = sc->opregs.config; + break; + + default: + if (offset >= 0x400) + value = pci_xhci_portregs_read(sc, offset); + else + value = 0; + + break; + } + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + + /* read doorbell always returns 0 */ + return (0); +} + +static uint64_t +pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->rtsoff; + value = 0; + + if (offset == XHCI_MFINDEX) { + value = sc->rtsregs.mfindex; + } else if (offset >= 0x20) { + int item; + uint32_t *p; + + offset -= 0x20; + item = offset % 32; + + assert(offset < sizeof(sc->rtsregs.intrreg)); + + p = &sc->rtsregs.intrreg.iman; + p += item / sizeof(uint32_t); + value = *p; + } + + DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->regsend; + value = 0; + + switch (offset) { + case 0: + /* rev major | rev minor | next-cap | cap-id */ + value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; + break; + case 4: + /* name string = "USB" */ + value = 0x20425355; + break; + case 8: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; + break; + case 12: + break; + case 16: + /* rev major | rev minor | next-cap | cap-id */ + value = (0x03 << 24) | XHCI_ID_PROTOCOLS; + break; + case 20: + /* name string = "USB" */ + value = 0x20425355; + break; + case 24: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; + break; + case 28: + break; + default: + DPRINTF(("pci_xhci: xecp invalid offset 0x%lx\r\n", offset)); + break; + } + + DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + + +static uint64_t +pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_xhci_softc *sc; + uint32_t value; + + sc = pi->pi_arg; + + assert(baridx == 0); + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) + value = pci_xhci_hostcap_read(sc, offset); + else if (offset < sc->dboff) + value = pci_xhci_hostop_read(sc, offset); + else if (offset < sc->rtsoff) + value = pci_xhci_dbregs_read(sc, offset); + else if (offset < sc->regsend) + value = pci_xhci_rtsregs_read(sc, offset); + else if (offset < (sc->regsend + 4*32)) + value = pci_xhci_xecp_read(sc, offset); + else { + value = 0; + WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); + } + + pthread_mutex_unlock(&sc->mtx); + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + return (value); +} + +static void +pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + struct xhci_trb evtrb; + int error; + + assert(portn <= XHCI_MAX_DEVS); + + DPRINTF(("xhci reset port %d\r\n", portn)); + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); + port->portsc |= XHCI_PS_PED | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + + if (warm && dev->dev_ue->ue_usbver == 3) { + port->portsc |= XHCI_PS_WRC; + } + + if ((port->portsc & XHCI_PS_PRC) == 0) { + port->portsc |= XHCI_PS_PRC; + + pci_xhci_set_evtrb(&evtrb, portn, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 1); + if (error != XHCI_TRB_ERROR_SUCCESS) + DPRINTF(("xhci reset port insert event " + "failed\r\n")); + } + } +} + +static void +pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc = XHCI_PS_CCS | /* connected */ + XHCI_PS_PP; /* port power */ + + if (dev->dev_ue->ue_usbver == 2) { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } else { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | + XHCI_PS_PED | /* enabled */ + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } + + DPRINTF(("Init port %d 0x%x\n", portn, port->portsc)); + } else { + port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; + DPRINTF(("Init empty port %d 0x%x\n", portn, port->portsc)); + } +} + +static int +pci_xhci_dev_intr(struct usb_hci *hci, int epctx) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_trb evtrb; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *p; + struct xhci_endp_ctx *ep_ctx; + int error; + int dir_in; + int epid; + + dir_in = epctx & 0x80; + epid = epctx & ~0x80; + + /* HW endpoint contexts are 0-15; convert to epid based on dir */ + epid = (epid * 2) + (dir_in ? 1 : 0); + + assert(epid >= 1 && epid <= 31); + + dev = hci->hci_sc; + sc = dev->xsc; + + /* check if device is ready; OS has to initialise it */ + if (sc->rtsregs.erstba_p == NULL || + (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || + dev->dev_ctx == NULL) + return (0); + + p = XHCI_PORTREG_PTR(sc, hci->hci_port); + + /* raise event if link U3 (suspended) state */ + if (XHCI_PS_PLS_GET(p->portsc) == 3) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); + if ((p->portsc & XHCI_PS_PLC) != 0) + return (0); + + p->portsc |= XHCI_PS_PLC; + + pci_xhci_set_evtrb(&evtrb, hci->hci_port, + XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 0); + if (error != XHCI_TRB_ERROR_SUCCESS) + goto done; + } + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { + DPRINTF(("xhci device interrupt on disabled endpoint %d\r\n", + epid)); + return (0); + } + + DPRINTF(("xhci device interrupt on endpoint %d\r\n", epid)); + + pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); + +done: + return (error); +} + +static int +pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) +{ + + DPRINTF(("xhci device event port %d\r\n", hci->hci_port)); + return (0); +} + + + +static void +pci_xhci_device_usage(char *opt) +{ + + fprintf(stderr, "Invalid USB emulation \"%s\"\r\n", opt); +} + +static int +pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) +{ + struct pci_xhci_dev_emu **devices; + struct pci_xhci_dev_emu *dev; + struct usb_devemu *ue; + void *devsc; +#ifdef __FreeBSD__ + char *uopt, *xopts, *config; +#else + char *uopt = NULL, *xopts, *config; +#endif + int usb3_port, usb2_port, i; + + usb3_port = sc->usb3_port_start - 1; + usb2_port = sc->usb2_port_start - 1; + devices = NULL; + + if (opts == NULL) + goto portsfinal; + + devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + + sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); + sc->devices = devices; + sc->ndevices = 0; + + uopt = strdup(opts); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || + usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { + WPRINTF(("pci_xhci max number of USB 2 or 3 " + "devices reached, max %d\r\n", XHCI_MAX_DEVS/2)); + usb2_port = usb3_port = -1; + goto done; + } + + /* device[=<config>] */ + if ((config = strchr(xopts, '=')) == NULL) + config = ""; /* no config */ + else + *config++ = '\0'; + + ue = usb_emu_finddev(xopts); + if (ue == NULL) { + pci_xhci_device_usage(xopts); + DPRINTF(("pci_xhci device not found %s\r\n", xopts)); + usb2_port = usb3_port = -1; + goto done; + } + + DPRINTF(("pci_xhci adding device %s, opts \"%s\"\r\n", + xopts, config)); + + dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); + dev->xsc = sc; + dev->hci.hci_sc = dev; + dev->hci.hci_intr = pci_xhci_dev_intr; + dev->hci.hci_event = pci_xhci_dev_event; + + if (ue->ue_usbver == 2) { + dev->hci.hci_port = usb2_port + 1; + devices[usb2_port] = dev; + usb2_port++; + } else { + dev->hci.hci_port = usb3_port + 1; + devices[usb3_port] = dev; + usb3_port++; + } + + dev->hci.hci_address = 0; + devsc = ue->ue_init(&dev->hci, config); + if (devsc == NULL) { + pci_xhci_device_usage(xopts); + usb2_port = usb3_port = -1; + goto done; + } + + dev->dev_ue = ue; + dev->dev_sc = devsc; + + /* assign slot number to device */ + sc->slots[sc->ndevices] = dev; + + sc->ndevices++; + } +#ifdef __FreeBSD__ + if (uopt != NULL) + free(uopt); +#endif + +portsfinal: + sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); + + if (sc->ndevices > 0) { + /* port and slot numbering start from 1 */ + sc->devices--; + sc->portregs--; + sc->slots--; + + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + pci_xhci_init_port(sc, i); + } + } else { + WPRINTF(("pci_xhci no USB devices configured\r\n")); + sc->ndevices = 1; + } + +done: + if (devices != NULL) { + if (usb2_port <= 0 && usb3_port <= 0) { + sc->devices = NULL; + for (i = 0; devices[i] != NULL; i++) + free(devices[i]); + sc->ndevices = -1; + + free(devices); + } + } + free(uopt); + return (sc->ndevices); +} + +static int +pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_xhci_softc *sc; + int error; + + if (xhci_in_use) { + WPRINTF(("pci_xhci controller already defined\r\n")); + return (-1); + } + xhci_in_use = 1; + + sc = calloc(1, sizeof(struct pci_xhci_softc)); + pi->pi_arg = sc; + sc->xsc_pi = pi; + + sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; + sc->usb3_port_start = 1; + + /* discover devices */ + error = pci_xhci_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | + XHCI_SET_HCIVERSION(0x0100); + sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | + XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ + XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); + sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | + XHCI_SET_HCSP2_IST(0x04); + sc->hcsparams3 = 0; /* no latency */ + sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ + XHCI_SET_HCCP1_SPC(1) | /* short packet */ + XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); + sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | + XHCI_SET_HCCP2_U3C(1); + sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); + + /* dboff must be 32-bit aligned */ + if (sc->dboff & 0x3) + sc->dboff = (sc->dboff + 0x3) & ~0x3; + + /* rtsoff must be 32-bytes aligned */ + sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); + if (sc->rtsoff & 0x1F) + sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; + + DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x\r\n", sc->dboff, + sc->rtsoff)); + + sc->opregs.usbsts = XHCI_STS_HCH; + sc->opregs.pgsz = XHCI_PAGESIZE_4K; + + pci_xhci_reset(sc); + + sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ + + /* + * Set extended capabilities pointer to be after regsend; + * value of xecp field is 32-bit offset. + */ + sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); + pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); + pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); + + pci_emul_add_msicap(pi, 1); + + /* regsend + xecp registers */ + pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); + DPRINTF(("pci_xhci pci_emu_alloc: %d\r\n", sc->regsend + 4*32)); + + + pci_lintr_request(pi); + + pthread_mutex_init(&sc->mtx, NULL); + +done: + if (error) { + free(sc); + } + + return (error); +} + + + +struct pci_devemu pci_de_xhci = { + .pe_emu = "xhci", + .pe_init = pci_xhci_init, + .pe_barwrite = pci_xhci_write, + .pe_barread = pci_xhci_read +}; +PCI_EMUL_SET(pci_de_xhci); diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h new file mode 100644 index 0000000000..7502f9396a --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.h @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PCI_XHCI_H_ +#define _PCI_XHCI_H_ + +#define PCI_USBREV 0x60 /* USB protocol revision */ + + +enum { /* dsc_slotstate */ + XHCI_ST_DISABLED, + XHCI_ST_ENABLED, + XHCI_ST_DEFAULT, + XHCI_ST_ADDRESSED, + XHCI_ST_CONFIGURED, + XHCI_ST_MAX +}; + +enum { + XHCI_ST_SLCTX_DISABLED, + XHCI_ST_SLCTX_DEFAULT, + XHCI_ST_SLCTX_ADDRESSED, + XHCI_ST_SLCTX_CONFIGURED +}; + +enum { + XHCI_ST_EPCTX_DISABLED, + XHCI_ST_EPCTX_RUNNING, + XHCI_ST_EPCTX_HALTED, + XHCI_ST_EPCTX_STOPPED, + XHCI_ST_EPCTX_ERROR +}; + +#define XHCI_MAX_DEVICES MIN(USB_MAX_DEVICES, 128) +#define XHCI_MAX_ENDPOINTS 32 /* hardcoded - do not change */ +#define XHCI_MAX_SCRATCHPADS 32 +#define XHCI_MAX_EVENTS (16 * 13) +#define XHCI_MAX_COMMANDS (16 * 1) +#define XHCI_MAX_RSEG 1 +#define XHCI_MAX_TRANSFERS 4 +#if USB_MAX_EP_STREAMS == 8 +#define XHCI_MAX_STREAMS 8 +#define XHCI_MAX_STREAMS_LOG 3 +#elif USB_MAX_EP_STREAMS == 1 +#define XHCI_MAX_STREAMS 1 +#define XHCI_MAX_STREAMS_LOG 0 +#else +#error "The USB_MAX_EP_STREAMS value is not supported." +#endif +#define XHCI_DEV_CTX_ADDR_ALIGN 64 /* bytes */ +#define XHCI_DEV_CTX_ALIGN 64 /* bytes */ +#define XHCI_INPUT_CTX_ALIGN 64 /* bytes */ +#define XHCI_SLOT_CTX_ALIGN 32 /* bytes */ +#define XHCI_ENDP_CTX_ALIGN 32 /* bytes */ +#define XHCI_STREAM_CTX_ALIGN 16 /* bytes */ +#define XHCI_TRANS_RING_SEG_ALIGN 16 /* bytes */ +#define XHCI_CMD_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_EVENT_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUF_ARRAY_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUFFER_ALIGN USB_PAGE_SIZE +#define XHCI_TRB_ALIGN 16 /* bytes */ +#define XHCI_TD_ALIGN 64 /* bytes */ +#define XHCI_PAGE_SIZE 4096 /* bytes */ + +struct xhci_slot_ctx { + volatile uint32_t dwSctx0; +#define XHCI_SCTX_0_ROUTE_SET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_ROUTE_GET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_SPEED_SET(x) (((x) & 0xF) << 20) +#define XHCI_SCTX_0_SPEED_GET(x) (((x) >> 20) & 0xF) +#define XHCI_SCTX_0_MTT_SET(x) (((x) & 0x1) << 25) +#define XHCI_SCTX_0_MTT_GET(x) (((x) >> 25) & 0x1) +#define XHCI_SCTX_0_HUB_SET(x) (((x) & 0x1) << 26) +#define XHCI_SCTX_0_HUB_GET(x) (((x) >> 26) & 0x1) +#define XHCI_SCTX_0_CTX_NUM_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_0_CTX_NUM_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx1; +#define XHCI_SCTX_1_MAX_EL_SET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_MAX_EL_GET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_RH_PORT_SET(x) (((x) & 0xFF) << 16) +#define XHCI_SCTX_1_RH_PORT_GET(x) (((x) >> 16) & 0xFF) +#define XHCI_SCTX_1_NUM_PORTS_SET(x) (((x) & 0xFF) << 24) +#define XHCI_SCTX_1_NUM_PORTS_GET(x) (((x) >> 24) & 0xFF) + volatile uint32_t dwSctx2; +#define XHCI_SCTX_2_TT_HUB_SID_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_HUB_SID_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_PORT_NUM_SET(x) (((x) & 0xFF) << 8) +#define XHCI_SCTX_2_TT_PORT_NUM_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_SCTX_2_TT_THINK_TIME_SET(x) (((x) & 0x3) << 16) +#define XHCI_SCTX_2_TT_THINK_TIME_GET(x) (((x) >> 16) & 0x3) +#define XHCI_SCTX_2_IRQ_TARGET_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_SCTX_2_IRQ_TARGET_GET(x) (((x) >> 22) & 0x3FF) + volatile uint32_t dwSctx3; +#define XHCI_SCTX_3_DEV_ADDR_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_DEV_ADDR_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_SLOT_STATE_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_3_SLOT_STATE_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx4; + volatile uint32_t dwSctx5; + volatile uint32_t dwSctx6; + volatile uint32_t dwSctx7; +}; + +struct xhci_endp_ctx { + volatile uint32_t dwEpCtx0; +#define XHCI_EPCTX_0_EPSTATE_SET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_EPSTATE_GET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_MULT_SET(x) (((x) & 0x3) << 8) +#define XHCI_EPCTX_0_MULT_GET(x) (((x) >> 8) & 0x3) +#define XHCI_EPCTX_0_MAXP_STREAMS_SET(x) (((x) & 0x1F) << 10) +#define XHCI_EPCTX_0_MAXP_STREAMS_GET(x) (((x) >> 10) & 0x1F) +#define XHCI_EPCTX_0_LSA_SET(x) (((x) & 0x1) << 15) +#define XHCI_EPCTX_0_LSA_GET(x) (((x) >> 15) & 0x1) +#define XHCI_EPCTX_0_IVAL_SET(x) (((x) & 0xFF) << 16) +#define XHCI_EPCTX_0_IVAL_GET(x) (((x) >> 16) & 0xFF) + volatile uint32_t dwEpCtx1; +#define XHCI_EPCTX_1_CERR_SET(x) (((x) & 0x3) << 1) +#define XHCI_EPCTX_1_CERR_GET(x) (((x) >> 1) & 0x3) +#define XHCI_EPCTX_1_EPTYPE_SET(x) (((x) & 0x7) << 3) +#define XHCI_EPCTX_1_EPTYPE_GET(x) (((x) >> 3) & 0x7) +#define XHCI_EPCTX_1_HID_SET(x) (((x) & 0x1) << 7) +#define XHCI_EPCTX_1_HID_GET(x) (((x) >> 7) & 0x1) +#define XHCI_EPCTX_1_MAXB_SET(x) (((x) & 0xFF) << 8) +#define XHCI_EPCTX_1_MAXB_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_EPCTX_1_MAXP_SIZE_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_1_MAXP_SIZE_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint64_t qwEpCtx2; +#define XHCI_EPCTX_2_DCS_SET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_DCS_GET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwEpCtx4; +#define XHCI_EPCTX_4_AVG_TRB_LEN_SET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_AVG_TRB_LEN_GET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint32_t dwEpCtx5; + volatile uint32_t dwEpCtx6; + volatile uint32_t dwEpCtx7; +}; + +struct xhci_input_ctx { +#define XHCI_INCTX_NON_CTRL_MASK 0xFFFFFFFCU + volatile uint32_t dwInCtx0; +#define XHCI_INCTX_0_DROP_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx1; +#define XHCI_INCTX_1_ADD_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx2; + volatile uint32_t dwInCtx3; + volatile uint32_t dwInCtx4; + volatile uint32_t dwInCtx5; + volatile uint32_t dwInCtx6; + volatile uint32_t dwInCtx7; +}; + +struct xhci_input_dev_ctx { + struct xhci_input_ctx ctx_input; + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +}; + +struct xhci_dev_ctx { + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +} __aligned(XHCI_DEV_CTX_ALIGN); +#define ctx_slot ctx_dev_slep.u_slot +#define ctx_ep ctx_dev_slep.u_ep + +struct xhci_stream_ctx { + volatile uint64_t qwSctx0; +#define XHCI_SCTX_0_DCS_GET(x) ((x) & 0x1) +#define XHCI_SCTX_0_DCS_SET(x) ((x) & 0x1) +#define XHCI_SCTX_0_SCT_SET(x) (((x) & 0x7) << 1) +#define XHCI_SCTX_0_SCT_GET(x) (((x) >> 1) & 0x7) +#define XHCI_SCTX_0_SCT_SEC_TR_RING 0x0 +#define XHCI_SCTX_0_SCT_PRIM_TR_RING 0x1 +#define XHCI_SCTX_0_SCT_PRIM_SSA_8 0x2 +#define XHCI_SCTX_0_SCT_PRIM_SSA_16 0x3 +#define XHCI_SCTX_0_SCT_PRIM_SSA_32 0x4 +#define XHCI_SCTX_0_SCT_PRIM_SSA_64 0x5 +#define XHCI_SCTX_0_SCT_PRIM_SSA_128 0x6 +#define XHCI_SCTX_0_SCT_PRIM_SSA_256 0x7 +#define XHCI_SCTX_0_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwSctx2; + volatile uint32_t dwSctx3; +}; + +struct xhci_trb { + volatile uint64_t qwTrb0; +#define XHCI_TRB_0_DIR_IN_MASK (0x80ULL << 0) +#define XHCI_TRB_0_WLENGTH_MASK (0xFFFFULL << 48) + volatile uint32_t dwTrb2; +#define XHCI_TRB_2_ERROR_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_2_ERROR_SET(x) (((x) & 0xFF) << 24) +#define XHCI_TRB_2_TDSZ_GET(x) (((x) >> 17) & 0x1F) +#define XHCI_TRB_2_TDSZ_SET(x) (((x) & 0x1F) << 17) +#define XHCI_TRB_2_REM_GET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_REM_SET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_BYTES_GET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_BYTES_SET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_IRQ_GET(x) (((x) >> 22) & 0x3FF) +#define XHCI_TRB_2_IRQ_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_TRB_2_STREAM_GET(x) (((x) >> 16) & 0xFFFF) +#define XHCI_TRB_2_STREAM_SET(x) (((x) & 0xFFFF) << 16) + + volatile uint32_t dwTrb3; +#define XHCI_TRB_3_TYPE_GET(x) (((x) >> 10) & 0x3F) +#define XHCI_TRB_3_TYPE_SET(x) (((x) & 0x3F) << 10) +#define XHCI_TRB_3_CYCLE_BIT (1U << 0) +#define XHCI_TRB_3_TC_BIT (1U << 1) /* command ring only */ +#define XHCI_TRB_3_ENT_BIT (1U << 1) /* transfer ring only */ +#define XHCI_TRB_3_ISP_BIT (1U << 2) +#define XHCI_TRB_3_ED_BIT (1U << 2) +#define XHCI_TRB_3_NSNOOP_BIT (1U << 3) +#define XHCI_TRB_3_CHAIN_BIT (1U << 4) +#define XHCI_TRB_3_IOC_BIT (1U << 5) +#define XHCI_TRB_3_IDT_BIT (1U << 6) +#define XHCI_TRB_3_TBC_GET(x) (((x) >> 7) & 3) +#define XHCI_TRB_3_TBC_SET(x) (((x) & 3) << 7) +#define XHCI_TRB_3_BEI_BIT (1U << 9) +#define XHCI_TRB_3_DCEP_BIT (1U << 9) +#define XHCI_TRB_3_PRSV_BIT (1U << 9) +#define XHCI_TRB_3_BSR_BIT (1U << 9) +#define XHCI_TRB_3_TRT_MASK (3U << 16) +#define XHCI_TRB_3_TRT_NONE (0U << 16) +#define XHCI_TRB_3_TRT_OUT (2U << 16) +#define XHCI_TRB_3_TRT_IN (3U << 16) +#define XHCI_TRB_3_DIR_IN (1U << 16) +#define XHCI_TRB_3_TLBPC_GET(x) (((x) >> 16) & 0xF) +#define XHCI_TRB_3_TLBPC_SET(x) (((x) & 0xF) << 16) +#define XHCI_TRB_3_EP_GET(x) (((x) >> 16) & 0x1F) +#define XHCI_TRB_3_EP_SET(x) (((x) & 0x1F) << 16) +#define XHCI_TRB_3_FRID_GET(x) (((x) >> 20) & 0x7FF) +#define XHCI_TRB_3_FRID_SET(x) (((x) & 0x7FF) << 20) +#define XHCI_TRB_3_ISO_SIA_BIT (1U << 31) +#define XHCI_TRB_3_SUSP_EP_BIT (1U << 23) +#define XHCI_TRB_3_SLOT_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_3_SLOT_SET(x) (((x) & 0xFF) << 24) + +/* Commands */ +#define XHCI_TRB_TYPE_RESERVED 0x00 +#define XHCI_TRB_TYPE_NORMAL 0x01 +#define XHCI_TRB_TYPE_SETUP_STAGE 0x02 +#define XHCI_TRB_TYPE_DATA_STAGE 0x03 +#define XHCI_TRB_TYPE_STATUS_STAGE 0x04 +#define XHCI_TRB_TYPE_ISOCH 0x05 +#define XHCI_TRB_TYPE_LINK 0x06 +#define XHCI_TRB_TYPE_EVENT_DATA 0x07 +#define XHCI_TRB_TYPE_NOOP 0x08 +#define XHCI_TRB_TYPE_ENABLE_SLOT 0x09 +#define XHCI_TRB_TYPE_DISABLE_SLOT 0x0A +#define XHCI_TRB_TYPE_ADDRESS_DEVICE 0x0B +#define XHCI_TRB_TYPE_CONFIGURE_EP 0x0C +#define XHCI_TRB_TYPE_EVALUATE_CTX 0x0D +#define XHCI_TRB_TYPE_RESET_EP 0x0E +#define XHCI_TRB_TYPE_STOP_EP 0x0F +#define XHCI_TRB_TYPE_SET_TR_DEQUEUE 0x10 +#define XHCI_TRB_TYPE_RESET_DEVICE 0x11 +#define XHCI_TRB_TYPE_FORCE_EVENT 0x12 +#define XHCI_TRB_TYPE_NEGOTIATE_BW 0x13 +#define XHCI_TRB_TYPE_SET_LATENCY_TOL 0x14 +#define XHCI_TRB_TYPE_GET_PORT_BW 0x15 +#define XHCI_TRB_TYPE_FORCE_HEADER 0x16 +#define XHCI_TRB_TYPE_NOOP_CMD 0x17 + +/* Events */ +#define XHCI_TRB_EVENT_TRANSFER 0x20 +#define XHCI_TRB_EVENT_CMD_COMPLETE 0x21 +#define XHCI_TRB_EVENT_PORT_STS_CHANGE 0x22 +#define XHCI_TRB_EVENT_BW_REQUEST 0x23 +#define XHCI_TRB_EVENT_DOORBELL 0x24 +#define XHCI_TRB_EVENT_HOST_CTRL 0x25 +#define XHCI_TRB_EVENT_DEVICE_NOTIFY 0x26 +#define XHCI_TRB_EVENT_MFINDEX_WRAP 0x27 + +/* Error codes */ +#define XHCI_TRB_ERROR_INVALID 0x00 +#define XHCI_TRB_ERROR_SUCCESS 0x01 +#define XHCI_TRB_ERROR_DATA_BUF 0x02 +#define XHCI_TRB_ERROR_BABBLE 0x03 +#define XHCI_TRB_ERROR_XACT 0x04 +#define XHCI_TRB_ERROR_TRB 0x05 +#define XHCI_TRB_ERROR_STALL 0x06 +#define XHCI_TRB_ERROR_RESOURCE 0x07 +#define XHCI_TRB_ERROR_BANDWIDTH 0x08 +#define XHCI_TRB_ERROR_NO_SLOTS 0x09 +#define XHCI_TRB_ERROR_STREAM_TYPE 0x0A +#define XHCI_TRB_ERROR_SLOT_NOT_ON 0x0B +#define XHCI_TRB_ERROR_ENDP_NOT_ON 0x0C +#define XHCI_TRB_ERROR_SHORT_PKT 0x0D +#define XHCI_TRB_ERROR_RING_UNDERRUN 0x0E +#define XHCI_TRB_ERROR_RING_OVERRUN 0x0F +#define XHCI_TRB_ERROR_VF_RING_FULL 0x10 +#define XHCI_TRB_ERROR_PARAMETER 0x11 +#define XHCI_TRB_ERROR_BW_OVERRUN 0x12 +#define XHCI_TRB_ERROR_CONTEXT_STATE 0x13 +#define XHCI_TRB_ERROR_NO_PING_RESP 0x14 +#define XHCI_TRB_ERROR_EV_RING_FULL 0x15 +#define XHCI_TRB_ERROR_INCOMPAT_DEV 0x16 +#define XHCI_TRB_ERROR_MISSED_SERVICE 0x17 +#define XHCI_TRB_ERROR_CMD_RING_STOP 0x18 +#define XHCI_TRB_ERROR_CMD_ABORTED 0x19 +#define XHCI_TRB_ERROR_STOPPED 0x1A +#define XHCI_TRB_ERROR_LENGTH 0x1B +#define XHCI_TRB_ERROR_BAD_MELAT 0x1D +#define XHCI_TRB_ERROR_ISOC_OVERRUN 0x1F +#define XHCI_TRB_ERROR_EVENT_LOST 0x20 +#define XHCI_TRB_ERROR_UNDEFINED 0x21 +#define XHCI_TRB_ERROR_INVALID_SID 0x22 +#define XHCI_TRB_ERROR_SEC_BW 0x23 +#define XHCI_TRB_ERROR_SPLIT_XACT 0x24 +} __aligned(4); + +struct xhci_dev_endpoint_trbs { + struct xhci_trb trb[(XHCI_MAX_STREAMS * + XHCI_MAX_TRANSFERS) + XHCI_MAX_STREAMS]; +}; + +struct xhci_event_ring_seg { + volatile uint64_t qwEvrsTablePtr; + volatile uint32_t dwEvrsTableSize; + volatile uint32_t dwEvrsReserved; +}; + +#endif /* _PCI_XHCI_H_ */ diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c new file mode 100644 index 0000000000..be188b79f2 --- /dev/null +++ b/usr/src/cmd/bhyve/pm.c @@ -0,0 +1,378 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Hudson River Trading LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <machine/vmm.h> + +#include <assert.h> +#include <errno.h> +#include <pthread.h> +#ifndef __FreeBSD__ +#include <stdlib.h> +#endif +#include <signal.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "pci_irq.h" +#include "pci_lpc.h" + +static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; +#ifdef __FreeBSD__ +static struct mevent *power_button; +static sig_t old_power_handler; +#else +struct vmctx *pwr_ctx; +#endif + +/* + * Reset Control register at I/O port 0xcf9. Bit 2 forces a system + * reset when it transitions from 0 to 1. Bit 1 selects the type of + * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard" + * reset. + */ +static int +reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int error; + + static uint8_t reset_control; + + if (bytes != 1) + return (-1); + if (in) + *eax = reset_control; + else { + reset_control = *eax; + + /* Treat hard and soft resets the same. */ + if (reset_control & 0x4) { + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + } + } + return (0); +} +INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler); + +/* + * ACPI's SCI is a level-triggered interrupt. + */ +static int sci_active; + +static void +sci_assert(struct vmctx *ctx) +{ + + if (sci_active) + return; + vm_isa_assert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 1; +} + +static void +sci_deassert(struct vmctx *ctx) +{ + + if (!sci_active) + return; + vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 0; +} + +/* + * Power Management 1 Event Registers + * + * The only power management event supported is a power button upon + * receiving SIGTERM. + */ +static uint16_t pm1_enable, pm1_status; + +#define PM1_TMR_STS 0x0001 +#define PM1_BM_STS 0x0010 +#define PM1_GBL_STS 0x0020 +#define PM1_PWRBTN_STS 0x0100 +#define PM1_SLPBTN_STS 0x0200 +#define PM1_RTC_STS 0x0400 +#define PM1_WAK_STS 0x8000 + +#define PM1_TMR_EN 0x0001 +#define PM1_GBL_EN 0x0020 +#define PM1_PWRBTN_EN 0x0100 +#define PM1_SLPBTN_EN 0x0200 +#define PM1_RTC_EN 0x0400 + +static void +sci_update(struct vmctx *ctx) +{ + int need_sci; + + /* See if the SCI should be active or not. */ + need_sci = 0; + if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS)) + need_sci = 1; + if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS)) + need_sci = 1; + if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS)) + need_sci = 1; + if (need_sci) + sci_assert(ctx); + else + sci_deassert(ctx); +} + +static int +pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_status; + else { + /* + * Writes are only permitted to clear certain bits by + * writing 1 to those flags. + */ + pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS | + PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS)); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} + +static int +pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_enable; + else { + /* + * Only permit certain bits to be set. We never use + * the global lock, but ACPI-CA whines profusely if it + * can't set GBL_EN. + */ + pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler); +INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler); + +#ifdef __FreeBSD__ +static void +power_button_handler(int signal, enum ev_type type, void *arg) +{ + struct vmctx *ctx; + + ctx = arg; + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); +} + +#else +/* + * Initiate graceful power off. + */ +/*ARGSUSED*/ +static void +power_button_handler(int signal, siginfo_t *type, void *cp) +{ + /* + * In theory, taking the 'pm_lock' mutex from within this signal + * handler could lead to deadlock if the main thread already held this + * mutex. In reality, this mutex is local to this file and all of the + * other usage in this file only occurs in functions which are FreeBSD + * specific (and thus currently not used). Thus, for consistency with + * the other code in this file, we take the mutex, but in the future, + * if these other functions are ever enabled for use on non-FreeBSD + * systems and these functions could be called directly by a thread + * (which would then hold the mutex), then we need to revisit the use + * of this mutex in this signal handler. + */ + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(pwr_ctx); + } + pthread_mutex_unlock(&pm_lock); +} +#endif + +/* + * Power Management 1 Control Register + * + * This is mostly unimplemented except that we wish to handle writes that + * set SPL_EN to handle S5 (soft power off). + */ +static uint16_t pm1_control; + +#define PM1_SCI_EN 0x0001 +#define PM1_SLP_TYP 0x1c00 +#define PM1_SLP_EN 0x2000 +#define PM1_ALWAYS_ZERO 0xc003 + +static int +pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int error; + + if (bytes != 2) + return (-1); + if (in) + *eax = pm1_control; + else { + /* + * Various bits are write-only or reserved, so force them + * to zero in pm1_control. Always preserve SCI_EN as OSPM + * can never change it. + */ + pm1_control = (pm1_control & PM1_SCI_EN) | + (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO)); + + /* + * If SLP_EN is set, check for S5. Bhyve's _S5_ method + * says that '5' should be stored in SLP_TYP for S5. + */ + if (*eax & PM1_SLP_EN) { + if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + assert(error == 0 || errno == EALREADY); + } + } + } + return (0); +} +INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler); +#ifdef __FreeBSD__ +SYSRES_IO(PM1A_EVT_ADDR, 8); +#endif + +/* + * ACPI SMI Command Register + * + * This write-only register is used to enable and disable ACPI. + */ +static int +smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + assert(!in); + if (bytes != 1) + return (-1); + + pthread_mutex_lock(&pm_lock); + switch (*eax) { + case BHYVE_ACPI_ENABLE: + pm1_control |= PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button == NULL) { + power_button = mevent_add(SIGTERM, EVF_SIGNAL, + power_button_handler, ctx); + old_power_handler = signal(SIGTERM, SIG_IGN); + } +#endif + break; + case BHYVE_ACPI_DISABLE: + pm1_control &= ~PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button != NULL) { + mevent_delete(power_button); + power_button = NULL; + signal(SIGTERM, old_power_handler); + } +#endif + break; + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler); +#ifdef __FreeBSD__ +SYSRES_IO(SMI_CMD, 1); +#endif + +void +sci_init(struct vmctx *ctx) +{ + + /* + * Mark ACPI's SCI as level trigger and bump its use count + * in the PIRQ router. + */ + pci_irq_use(SCI_INT); + vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); + +#ifndef __FreeBSD__ + { + /* + * Install SIGTERM signal handler for graceful power off. + */ + struct sigaction act; + + pwr_ctx = ctx; + act.sa_flags = 0; + act.sa_sigaction = power_button_handler; + (void) sigaction(SIGTERM, &act, NULL); + } +#endif +} diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c new file mode 100644 index 0000000000..d3040a8df7 --- /dev/null +++ b/usr/src/cmd/bhyve/post.c @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> + +#include "inout.h" +#include "pci_lpc.h" + +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); +SYSRES_IO(0x84, 1); diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c new file mode 100644 index 0000000000..5453a26949 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -0,0 +1,383 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "atkbdc.h" +#include "console.h" + +/* keyboard device commands */ +#define PS2KC_RESET_DEV 0xff +#define PS2KC_DISABLE 0xf5 +#define PS2KC_ENABLE 0xf4 +#define PS2KC_SET_TYPEMATIC 0xf3 +#define PS2KC_SEND_DEV_ID 0xf2 +#define PS2KC_SET_SCANCODE_SET 0xf0 +#define PS2KC_ECHO 0xee +#define PS2KC_SET_LEDS 0xed + +#define PS2KC_BAT_SUCCESS 0xaa +#define PS2KC_ACK 0xfa + +#define PS2KBD_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2KBD_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2kbd_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + bool enabled; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ +}; + +#define SCANCODE_E0_PREFIX 1 +struct extended_translation { + uint32_t keysym; + uint8_t scancode; + int flags; +}; + +/* + * FIXME: Pause/break and Print Screen/SysRq require special handling. + */ +static const struct extended_translation extended_translations[] = { + {0xff08, 0x66}, /* Back space */ + {0xff09, 0x0d}, /* Tab */ + {0xff0d, 0x5a}, /* Return */ + {0xff1b, 0x76}, /* Escape */ + {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ + {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ + {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ + {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ + {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ + {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ + {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ + {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ + {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ + {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ + {0xffe1, 0x12}, /* Left shift */ + {0xffe2, 0x59}, /* Right shift */ + {0xffe3, 0x14}, /* Left control */ + {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ + /* {0xffe7, XXX}, Left meta */ + /* {0xffe8, XXX}, Right meta */ + {0xffe9, 0x11}, /* Left alt */ + {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ + {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ + {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ + {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ + {0xffbe, 0x05}, /* F1 */ + {0xffbf, 0x06}, /* F2 */ + {0xffc0, 0x04}, /* F3 */ + {0xffc1, 0x0c}, /* F4 */ + {0xffc2, 0x03}, /* F5 */ + {0xffc3, 0x0b}, /* F6 */ + {0xffc4, 0x83}, /* F7 */ + {0xffc5, 0x0a}, /* F8 */ + {0xffc6, 0x01}, /* F9 */ + {0xffc7, 0x09}, /* F10 */ + {0xffc8, 0x78}, /* F11 */ + {0xffc9, 0x07}, /* F12 */ + {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ + {0xff14, 0x7e}, /* ScrollLock */ + /* NumLock and Keypads*/ + {0xff7f, 0x77}, /* NumLock */ + {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ + {0xffaa, 0x7c}, /* Keypad asterisk */ + {0xffad, 0x7b}, /* Keypad minus */ + {0xffab, 0x79}, /* Keypad plus */ + {0xffb7, 0x6c}, /* Keypad 7 */ + {0xff95, 0x6c}, /* Keypad home */ + {0xffb8, 0x75}, /* Keypad 8 */ + {0xff97, 0x75}, /* Keypad up arrow */ + {0xffb9, 0x7d}, /* Keypad 9 */ + {0xff9a, 0x7d}, /* Keypad PgUp */ + {0xffb4, 0x6b}, /* Keypad 4 */ + {0xff96, 0x6b}, /* Keypad left arrow */ + {0xffb5, 0x73}, /* Keypad 5 */ + {0xff9d, 0x73}, /* Keypad empty */ + {0xffb6, 0x74}, /* Keypad 6 */ + {0xff98, 0x74}, /* Keypad right arrow */ + {0xffb1, 0x69}, /* Keypad 1 */ + {0xff9c, 0x69}, /* Keypad end */ + {0xffb2, 0x72}, /* Keypad 2 */ + {0xff99, 0x72}, /* Keypad down arrow */ + {0xffb3, 0x7a}, /* Keypad 3 */ + {0xff9b, 0x7a}, /* Keypad PgDown */ + {0xffb0, 0x70}, /* Keypad 0 */ + {0xff9e, 0x70}, /* Keypad ins */ + {0xffae, 0x71}, /* Keypad . */ + {0xff9f, 0x71}, /* Keypad del */ + {0, 0, 0} /* Terminator */ +}; + +/* ASCII to type 2 scancode lookup table */ +static const uint8_t ascii_translations[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, +}; + +static void +fifo_init(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_put(struct ps2kbd_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2kbd_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +int +ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) +{ + pthread_mutex_lock(&sc->mtx); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2KC_SET_TYPEMATIC: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_SCANCODE_SET: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_LEDS: + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + } else { + switch (val) { + case 0x00: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_RESET_DEV: + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + fifo_put(sc, PS2KC_BAT_SUCCESS); + break; + case PS2KC_DISABLE: + sc->enabled = false; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ENABLE: + sc->enabled = true; + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_TYPEMATIC: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SEND_DEV_ID: + fifo_put(sc, PS2KC_ACK); + fifo_put(sc, 0xab); + fifo_put(sc, 0x83); + break; + case PS2KC_SET_SCANCODE_SET: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ECHO: + fifo_put(sc, PS2KC_ECHO); + break; + case PS2KC_SET_LEDS: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard command " + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +/* + * Translate keysym to type 2 scancode and insert into keyboard buffer. + */ +static void +ps2kbd_keysym_queue(struct ps2kbd_softc *sc, + int down, uint32_t keysym) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + int e0_prefix, found; + uint8_t code; + const struct extended_translation *trans; + + found = 0; + if (keysym < 0x80) { + code = ascii_translations[keysym]; + e0_prefix = 0; + found = 1; + } else { + for (trans = &(extended_translations[0]); trans->keysym != 0; + trans++) { + if (keysym == trans->keysym) { + code = trans->scancode; + e0_prefix = trans->flags & SCANCODE_E0_PREFIX; + found = 1; + break; + } + } + } + + if (!found) { + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", keysym); + return; + } + + if (e0_prefix) + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, code); +} + +static void +ps2kbd_event(int down, uint32_t keysym, void *arg) +{ + struct ps2kbd_softc *sc = arg; + int fifo_full; + + pthread_mutex_lock(&sc->mtx); + if (!sc->enabled) { + pthread_mutex_unlock(&sc->mtx); + return; + } + fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; + ps2kbd_keysym_queue(sc, down, keysym); + pthread_mutex_unlock(&sc->mtx); + + if (!fifo_full) + atkbdc_event(sc->atkbdc_sc, 1); +} + +struct ps2kbd_softc * +ps2kbd_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2kbd_softc *sc; + + sc = calloc(1, sizeof (struct ps2kbd_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + console_kbd_register(ps2kbd_event, sc, 1); + + return (sc); +} + diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h new file mode 100644 index 0000000000..17be6d0466 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2KBD_H_ +#define _PS2KBD_H_ + +struct atkbdc_softc; + +struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); + +int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); +void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); + +#endif /* _PS2KBD_H_ */ diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c new file mode 100644 index 0000000000..b2e08262b1 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -0,0 +1,418 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "atkbdc.h" +#include "console.h" + +/* mouse device commands */ +#define PS2MC_RESET_DEV 0xff +#define PS2MC_SET_DEFAULTS 0xf6 +#define PS2MC_DISABLE 0xf5 +#define PS2MC_ENABLE 0xf4 +#define PS2MC_SET_SAMPLING_RATE 0xf3 +#define PS2MC_SEND_DEV_ID 0xf2 +#define PS2MC_SET_REMOTE_MODE 0xf0 +#define PS2MC_SEND_DEV_DATA 0xeb +#define PS2MC_SET_STREAM_MODE 0xea +#define PS2MC_SEND_DEV_STATUS 0xe9 +#define PS2MC_SET_RESOLUTION 0xe8 +#define PS2MC_SET_SCALING1 0xe7 +#define PS2MC_SET_SCALING2 0xe6 + +#define PS2MC_BAT_SUCCESS 0xaa +#define PS2MC_ACK 0xfa + +/* mouse device id */ +#define PS2MOUSE_DEV_ID 0x0 + +/* mouse data bits */ +#define PS2M_DATA_Y_OFLOW 0x80 +#define PS2M_DATA_X_OFLOW 0x40 +#define PS2M_DATA_Y_SIGN 0x20 +#define PS2M_DATA_X_SIGN 0x10 +#define PS2M_DATA_AONE 0x08 +#define PS2M_DATA_MID_BUTTON 0x04 +#define PS2M_DATA_RIGHT_BUTTON 0x02 +#define PS2M_DATA_LEFT_BUTTON 0x01 + +/* mouse status bits */ +#define PS2M_STS_REMOTE_MODE 0x40 +#define PS2M_STS_ENABLE_DEV 0x20 +#define PS2M_STS_SCALING_21 0x10 +#define PS2M_STS_MID_BUTTON 0x04 +#define PS2M_STS_RIGHT_BUTTON 0x02 +#define PS2M_STS_LEFT_BUTTON 0x01 + +#define PS2MOUSE_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2MOUSE_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2mouse_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + uint8_t status; + uint8_t resolution; + uint8_t sampling_rate; + int ctrlenable; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ + + int cur_x, cur_y; + int delta_x, delta_y; +}; + +static void +fifo_init(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_put(struct ps2mouse_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2mouse_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +static void +movement_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->delta_x = 0; + sc->delta_y = 0; +} + +static void +movement_update(struct ps2mouse_softc *sc, int x, int y) +{ + sc->delta_x += x - sc->cur_x; + sc->delta_y += sc->cur_y - y; + sc->cur_x = x; + sc->cur_y = y; +} + +static void +movement_get(struct ps2mouse_softc *sc) +{ + uint8_t val0, val1, val2; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + val0 = PS2M_DATA_AONE; + val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | + PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); + + if (sc->delta_x >= 0) { + if (sc->delta_x > 255) { + val0 |= PS2M_DATA_X_OFLOW; + val1 = 255; + } else + val1 = sc->delta_x; + } else { + val0 |= PS2M_DATA_X_SIGN; + if (sc->delta_x < -255) { + val0 |= PS2M_DATA_X_OFLOW; + val1 = 255; + } else + val1 = sc->delta_x; + } + sc->delta_x = 0; + + if (sc->delta_y >= 0) { + if (sc->delta_y > 255) { + val0 |= PS2M_DATA_Y_OFLOW; + val2 = 255; + } else + val2 = sc->delta_y; + } else { + val0 |= PS2M_DATA_Y_SIGN; + if (sc->delta_y < -255) { + val0 |= PS2M_DATA_Y_OFLOW; + val2 = 255; + } else + val2 = sc->delta_y; + } + sc->delta_y = 0; + + if (sc->fifo.num < (sc->fifo.size - 3)) { + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); + } +} + +static void +ps2mouse_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + fifo_reset(sc); + movement_reset(sc); + sc->status = PS2M_STS_ENABLE_DEV; + sc->resolution = 4; + sc->sampling_rate = 100; + + sc->cur_x = 0; + sc->cur_y = 0; + sc->delta_x = 0; + sc->delta_y = 0; +} + +int +ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +int +ps2mouse_fifocnt(struct ps2mouse_softc *sc) +{ + return (sc->fifo.num); +} + +void +ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) +{ + pthread_mutex_lock(&sc->mtx); + if (enable) + sc->ctrlenable = 1; + else { + sc->ctrlenable = 0; + sc->fifo.rindex = 0; + sc->fifo.windex = 0; + sc->fifo.num = 0; + } + pthread_mutex_unlock(&sc->mtx); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) +{ + pthread_mutex_lock(&sc->mtx); + fifo_reset(sc); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2MC_SET_SAMPLING_RATE: + sc->sampling_rate = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_RESOLUTION: + sc->resolution = val; + fifo_put(sc, PS2MC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 mouse current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + + } else if (insert) { + fifo_put(sc, val); + } else { + switch (val) { + case 0x00: + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_RESET_DEV: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MC_BAT_SUCCESS); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_DEFAULTS: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_DISABLE: + fifo_reset(sc); + sc->status &= ~PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_ENABLE: + fifo_reset(sc); + sc->status |= PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SAMPLING_RATE: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_ID: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_REMOTE_MODE: + sc->status |= PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_DATA: + fifo_put(sc, PS2MC_ACK); + movement_get(sc); + break; + case PS2MC_SET_STREAM_MODE: + sc->status &= ~PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_STATUS: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, sc->status); + fifo_put(sc, sc->resolution); + fifo_put(sc, sc->sampling_rate); + break; + case PS2MC_SET_RESOLUTION: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SCALING1: + case PS2MC_SET_SCALING2: + fifo_put(sc, PS2MC_ACK); + break; + default: + fifo_put(sc, PS2MC_ACK); + fprintf(stderr, "Unhandled ps2 mouse command " + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +ps2mouse_event(uint8_t button, int x, int y, void *arg) +{ + struct ps2mouse_softc *sc = arg; + + pthread_mutex_lock(&sc->mtx); + movement_update(sc, x, y); + + sc->status &= ~(PS2M_STS_LEFT_BUTTON | + PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + if (button & (1 << 0)) + sc->status |= PS2M_STS_LEFT_BUTTON; + if (button & (1 << 1)) + sc->status |= PS2M_STS_MID_BUTTON; + if (button & (1 << 2)) + sc->status |= PS2M_STS_RIGHT_BUTTON; + + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { + /* no data reporting */ + pthread_mutex_unlock(&sc->mtx); + return; + } + + movement_get(sc); + pthread_mutex_unlock(&sc->mtx); + + if (sc->fifo.num > 0) + atkbdc_event(sc->atkbdc_sc, 0); +} + +struct ps2mouse_softc * +ps2mouse_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2mouse_softc *sc; + + sc = calloc(1, sizeof (struct ps2mouse_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + pthread_mutex_lock(&sc->mtx); + ps2mouse_reset(sc); + pthread_mutex_unlock(&sc->mtx); + + console_ptr_register(ps2mouse_event, sc, 1); + + return (sc); +} + + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h new file mode 100644 index 0000000000..59430b01e2 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2MOUSE_H_ +#define _PS2MOUSE_H_ + +struct atkbdc_softc; + +struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); + +int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); +void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); +int ps2mouse_fifocnt(struct ps2mouse_softc *sc); + +#endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c new file mode 100644 index 0000000000..39ea1611f9 --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.c @@ -0,0 +1,1148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Leon Dang + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/endian.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <sys/time.h> +#include <arpa/inet.h> +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <netinet/in.h> +#include <netdb.h> + +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <pthread.h> +#include <pthread_np.h> +#include <signal.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <zlib.h> + +#ifndef __FreeBSD__ +#include <sys/debug.h> +#endif + +#include "bhyvegc.h" +#include "console.h" +#include "rfb.h" +#include "sockstream.h" + +#ifndef NO_OPENSSL +#include <openssl/des.h> +#endif + +static int rfb_debug = 0; +#define DPRINTF(params) if (rfb_debug) printf params +#define WPRINTF(params) printf params + +#define AUTH_LENGTH 16 +#define PASSWD_LENGTH 8 + +#define SECURITY_TYPE_NONE 1 +#define SECURITY_TYPE_VNC_AUTH 2 + +#define AUTH_FAILED_UNAUTH 1 +#define AUTH_FAILED_ERROR 2 + +struct rfb_softc { + int sfd; + pthread_t tid; + + int cfd; + + int width, height; + + char *password; + + bool enc_raw_ok; + bool enc_zlib_ok; + bool enc_resize_ok; + + z_stream zstream; + uint8_t *zbuf; + int zbuflen; + + int conn_wait; + int sending; + pthread_mutex_t mtx; + pthread_cond_t cond; + + int hw_crc; + uint32_t *crc; /* WxH crc cells */ + uint32_t *crc_tmp; /* buffer to store single crc row */ + int crc_width, crc_height; +}; + +struct rfb_pixfmt { + uint8_t bpp; + uint8_t depth; + uint8_t bigendian; + uint8_t truecolor; + uint16_t red_max; + uint16_t green_max; + uint16_t blue_max; + uint8_t red_shift; + uint8_t green_shift; + uint8_t blue_shift; + uint8_t pad[3]; +}; + +struct rfb_srvr_info { + uint16_t width; + uint16_t height; + struct rfb_pixfmt pixfmt; + uint32_t namelen; +}; + +struct rfb_pixfmt_msg { + uint8_t type; + uint8_t pad[3]; + struct rfb_pixfmt pixfmt; +}; + +#define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_ZLIB 6 +#define RFB_ENCODING_RESIZE -223 + +#define RFB_MAX_WIDTH 2000 +#define RFB_MAX_HEIGHT 1200 +#define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 + +/* percentage changes to screen before sending the entire screen */ +#define RFB_SEND_ALL_THRESH 25 + +struct rfb_enc_msg { + uint8_t type; + uint8_t pad; + uint16_t numencs; +}; + +struct rfb_updt_msg { + uint8_t type; + uint8_t incremental; + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; +}; + +struct rfb_key_msg { + uint8_t type; + uint8_t down; + uint16_t pad; + uint32_t code; +}; + +struct rfb_ptr_msg { + uint8_t type; + uint8_t button; + uint16_t x; + uint16_t y; +}; + +struct rfb_srvr_updt_msg { + uint8_t type; + uint8_t pad; + uint16_t numrects; +}; + +struct rfb_srvr_rect_hdr { + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; + uint32_t encoding; +}; + +struct rfb_cuttext_msg { + uint8_t type; + uint8_t padding[3]; + uint32_t length; +}; + + +static void +rfb_send_server_init_msg(int cfd) +{ + struct bhyvegc_image *gc_image; + struct rfb_srvr_info sinfo; + + gc_image = console_get_image(); + + sinfo.width = htons(gc_image->width); + sinfo.height = htons(gc_image->height); + sinfo.pixfmt.bpp = 32; + sinfo.pixfmt.depth = 32; + sinfo.pixfmt.bigendian = 0; + sinfo.pixfmt.truecolor = 1; + sinfo.pixfmt.red_max = htons(255); + sinfo.pixfmt.green_max = htons(255); + sinfo.pixfmt.blue_max = htons(255); + sinfo.pixfmt.red_shift = 16; + sinfo.pixfmt.green_shift = 8; + sinfo.pixfmt.blue_shift = 0; + sinfo.namelen = htonl(strlen("bhyve")); + (void)stream_write(cfd, &sinfo, sizeof(sinfo)); + (void)stream_write(cfd, "bhyve", strlen("bhyve")); +} + +static void +rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = htons(0); + srect_hdr.y = htons(0); + srect_hdr.width = htons(rc->width); + srect_hdr.height = htons(rc->height); + srect_hdr.encoding = htonl(RFB_ENCODING_RESIZE); + stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); +} + +static void +rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_pixfmt_msg pixfmt_msg; + + (void)stream_read(cfd, ((void *)&pixfmt_msg)+1, sizeof(pixfmt_msg)-1); +} + + +static void +rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_enc_msg enc_msg; + int i; + uint32_t encoding; + + assert((sizeof(enc_msg) - 1) == 3); + (void)stream_read(cfd, ((void *)&enc_msg)+1, sizeof(enc_msg)-1); + + for (i = 0; i < htons(enc_msg.numencs); i++) { + (void)stream_read(cfd, &encoding, sizeof(encoding)); + switch (htonl(encoding)) { + case RFB_ENCODING_RAW: + rc->enc_raw_ok = true; + break; + case RFB_ENCODING_ZLIB: + rc->enc_zlib_ok = true; + deflateInit(&rc->zstream, Z_BEST_SPEED); + break; + case RFB_ENCODING_RESIZE: + rc->enc_resize_ok = true; + break; + } + } +} + +/* + * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only + */ +static __inline uint32_t +fast_crc32(void *buf, int len, uint32_t crcval) +{ + uint32_t q = len / sizeof(uint32_t); + uint32_t *p = (uint32_t *)buf; + + while (q--) { + asm volatile ( + ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;" + :"=S" (crcval) + :"0" (crcval), "c" (*p) + ); + p++; + } + + return (crcval); +} + + +static int +rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, + int x, int y, int w, int h) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + unsigned long zlen; + ssize_t nwrite, total; + int err; + uint32_t *p; + uint8_t *zbufp; + + /* + * Send a single rectangle of the given x, y, w h dimensions. + */ + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); + + + /* Rectangle header */ + srect_hdr.x = htons(x); + srect_hdr.y = htons(y); + srect_hdr.width = htons(w); + srect_hdr.height = htons(h); + + h = y + h; + w *= sizeof(uint32_t); + if (rc->enc_zlib_ok) { + zbufp = rc->zbuf; + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + rc->zstream.next_in = (Bytef *)p; + rc->zstream.avail_in = w; + rc->zstream.next_out = (Bytef *)zbufp; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 - + rc->zstream.total_out; + rc->zstream.data_type = Z_BINARY; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib[rect] deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + zbufp = rc->zbuf + rc->zstream.total_out; + p += gc->width; + } + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + + total = 0; + zbufp = rc->zbuf; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + memcpy(zbufp, p, w); + zbufp += w; + total += w; + p += gc->width; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + total = stream_write(cfd, rc->zbuf, total); + + return (total); +} + +static int +rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + ssize_t nwrite; + unsigned long zlen; + int err; + + /* + * Send the whole thing + */ + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); + + /* Rectangle header */ + srect_hdr.x = 0; + srect_hdr.y = 0; + srect_hdr.width = htons(gc->width); + srect_hdr.height = htons(gc->height); + if (rc->enc_zlib_ok) { + rc->zstream.next_in = (Bytef *)gc->data; + rc->zstream.avail_in = gc->width * gc->height * + sizeof(uint32_t); + rc->zstream.next_out = (Bytef *)rc->zbuf; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16; + rc->zstream.data_type = Z_BINARY; + + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + nwrite = stream_write(cfd, gc->data, + gc->width * gc->height * sizeof(uint32_t)); + + return (nwrite); +} + +#define PIX_PER_CELL 32 +#define PIXCELL_SHIFT 5 +#define PIXCELL_MASK 0x1F + +static int +rfb_send_screen(struct rfb_softc *rc, int cfd, int all) +{ + struct bhyvegc_image *gc_image; + ssize_t nwrite; + int x, y; + int celly, cellwidth; + int xcells, ycells; + int w, h; + uint32_t *p; + int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */ + int retval; + uint32_t *crc_p, *orig_crc; + int changes; + + console_refresh(); + gc_image = console_get_image(); + + pthread_mutex_lock(&rc->mtx); + if (rc->sending) { + pthread_mutex_unlock(&rc->mtx); + return (1); + } + rc->sending = 1; + pthread_mutex_unlock(&rc->mtx); + + retval = 0; + + if (all) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* + * Calculate the checksum for each 32x32 cell. Send each that + * has changed since the last scan. + */ + + /* Resolution changed */ + + rc->crc_width = gc_image->width; + rc->crc_height = gc_image->height; + + w = rc->crc_width; + h = rc->crc_height; + xcells = howmany(rc->crc_width, PIX_PER_CELL); + ycells = howmany(rc->crc_height, PIX_PER_CELL); + + rem_x = w & PIXCELL_MASK; + + rem_y = h & PIXCELL_MASK; + if (!rem_y) + rem_y = PIX_PER_CELL; + + p = gc_image->data; + + /* + * Go through all cells and calculate crc. If significant number + * of changes, then send entire screen. + * crc_tmp is dual purpose: to store the new crc and to flag as + * a cell that has changed. + */ + crc_p = rc->crc_tmp - xcells; + orig_crc = rc->crc - xcells; + changes = 0; + memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells); + for (y = 0; y < h; y++) { + if ((y & PIXCELL_MASK) == 0) { + crc_p += xcells; + orig_crc += xcells; + } + + for (x = 0; x < xcells; x++) { + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + + if (rc->hw_crc) + crc_p[x] = fast_crc32(p, + cellwidth * sizeof(uint32_t), + crc_p[x]); + else + crc_p[x] = (uint32_t)crc32(crc_p[x], + (Bytef *)p, + cellwidth * sizeof(uint32_t)); + + p += cellwidth; + + /* check for crc delta if last row in cell */ + if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { + if (orig_crc[x] != crc_p[x]) { + orig_crc[x] = crc_p[x]; + crc_p[x] = 1; + changes++; + } else { + crc_p[x] = 0; + } + } + } + } + + /* If number of changes is > THRESH percent, send the whole screen */ + if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* Go through all cells, and send only changed ones */ + crc_p = rc->crc_tmp; + for (y = 0; y < h; y += PIX_PER_CELL) { + /* previous cell's row */ + celly = (y >> PIXCELL_SHIFT); + + /* Delta check crc to previous set */ + for (x = 0; x < xcells; x++) { + if (*crc_p++ == 0) + continue; + + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + nwrite = rfb_send_rect(rc, cfd, + gc_image, + x * PIX_PER_CELL, + celly * PIX_PER_CELL, + cellwidth, + y + PIX_PER_CELL >= h ? rem_y : PIX_PER_CELL); + if (nwrite <= 0) { + retval = nwrite; + goto done; + } + } + } + retval = 1; + +done: + pthread_mutex_lock(&rc->mtx); + rc->sending = 0; + pthread_mutex_unlock(&rc->mtx); + + return (retval); +} + + +static void +rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly) +{ + struct rfb_updt_msg updt_msg; + struct bhyvegc_image *gc_image; + + (void)stream_read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + + console_refresh(); + gc_image = console_get_image(); + + updt_msg.x = htons(updt_msg.x); + updt_msg.y = htons(updt_msg.y); + updt_msg.width = htons(updt_msg.width); + updt_msg.height = htons(updt_msg.height); + + if (updt_msg.width != gc_image->width || + updt_msg.height != gc_image->height) { + rc->width = gc_image->width; + rc->height = gc_image->height; + if (rc->enc_resize_ok) + rfb_send_resize_update_msg(rc, cfd); + } + + if (discardonly) + return; + + rfb_send_screen(rc, cfd, 1); +} + +static void +rfb_recv_key_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_key_msg key_msg; + + (void)stream_read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + + console_key_event(key_msg.down, htonl(key_msg.code)); +} + +static void +rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_ptr_msg ptr_msg; + + (void)stream_read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, htons(ptr_msg.x), htons(ptr_msg.y)); +} + +static void +rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_cuttext_msg ct_msg; + unsigned char buf[32]; + int len; + + len = stream_read(cfd, ((void *)&ct_msg) + 1, sizeof(ct_msg) - 1); + ct_msg.length = htonl(ct_msg.length); + while (ct_msg.length > 0) { + len = stream_read(cfd, buf, ct_msg.length > sizeof(buf) ? + sizeof(buf) : ct_msg.length); + ct_msg.length -= len; + } +} + +static int64_t +timeval_delta(struct timeval *prev, struct timeval *now) +{ + int64_t n1, n2; + n1 = now->tv_sec * 1000000 + now->tv_usec; + n2 = prev->tv_sec * 1000000 + prev->tv_usec; + return (n1 - n2); +} + +static void * +rfb_wr_thr(void *arg) +{ + struct rfb_softc *rc; + fd_set rfds; + struct timeval tv; + struct timeval prev_tv; + int64_t tdiff; + int cfd; + int err; + + rc = arg; + cfd = rc->cfd; + + prev_tv.tv_sec = 0; + prev_tv.tv_usec = 0; + while (rc->cfd >= 0) { + FD_ZERO(&rfds); + FD_SET(cfd, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 10000; + + err = select(cfd+1, &rfds, NULL, NULL, &tv); + if (err < 0) + return (NULL); + + /* Determine if its time to push screen; ~24hz */ + gettimeofday(&tv, NULL); + tdiff = timeval_delta(&prev_tv, &tv); + if (tdiff > 40000) { + prev_tv.tv_sec = tv.tv_sec; + prev_tv.tv_usec = tv.tv_usec; + if (rfb_send_screen(rc, cfd, 0) <= 0) { + return (NULL); + } + } else { + /* sleep */ + usleep(40000 - tdiff); + } + } + + return (NULL); +} + +void +rfb_handle(struct rfb_softc *rc, int cfd) +{ + const char *vbuf = "RFB 003.008\n"; + unsigned char buf[80]; + unsigned char *message = NULL; + +#ifndef NO_OPENSSL + unsigned char challenge[AUTH_LENGTH]; + unsigned char keystr[PASSWD_LENGTH]; + unsigned char crypt_expected[AUTH_LENGTH]; + + DES_key_schedule ks; + int i; +#endif + + pthread_t tid; + uint32_t sres = 0; + int len; + int perror = 1; + + rc->cfd = cfd; + + /* 1a. Send server version */ + stream_write(cfd, vbuf, strlen(vbuf)); + + /* 1b. Read client version */ + len = read(cfd, buf, sizeof(buf)); + + /* 2a. Send security type */ + buf[0] = 1; +#ifndef NO_OPENSSL + if (rc->password) + buf[1] = SECURITY_TYPE_VNC_AUTH; + else + buf[1] = SECURITY_TYPE_NONE; +#else + buf[1] = SECURITY_TYPE_NONE; +#endif + + stream_write(cfd, buf, 2); + + /* 2b. Read agreed security type */ + len = stream_read(cfd, buf, 1); + + /* 2c. Do VNC authentication */ + switch (buf[0]) { + case SECURITY_TYPE_NONE: + sres = 0; + break; + case SECURITY_TYPE_VNC_AUTH: + /* + * The client encrypts the challenge with DES, using a password + * supplied by the user as the key. + * To form the key, the password is truncated to + * eight characters, or padded with null bytes on the right. + * The client then sends the resulting 16-bytes response. + */ +#ifndef NO_OPENSSL + strncpy(keystr, rc->password, PASSWD_LENGTH); + + /* VNC clients encrypts the challenge with all the bit fields + * in each byte of the password mirrored. + * Here we flip each byte of the keystr. + */ + for (i = 0; i < PASSWD_LENGTH; i++) { + keystr[i] = (keystr[i] & 0xF0) >> 4 + | (keystr[i] & 0x0F) << 4; + keystr[i] = (keystr[i] & 0xCC) >> 2 + | (keystr[i] & 0x33) << 2; + keystr[i] = (keystr[i] & 0xAA) >> 1 + | (keystr[i] & 0x55) << 1; + } + + /* Initialize a 16-byte random challenge */ + arc4random_buf(challenge, sizeof(challenge)); + stream_write(cfd, challenge, AUTH_LENGTH); + + /* Receive the 16-byte challenge response */ + stream_read(cfd, buf, AUTH_LENGTH); + + memcpy(crypt_expected, challenge, AUTH_LENGTH); + + /* Encrypt the Challenge with DES */ + DES_set_key((const_DES_cblock *)keystr, &ks); + DES_ecb_encrypt((const_DES_cblock *)challenge, + (const_DES_cblock *)crypt_expected, + &ks, DES_ENCRYPT); + DES_ecb_encrypt((const_DES_cblock *)(challenge + PASSWD_LENGTH), + (const_DES_cblock *)(crypt_expected + + PASSWD_LENGTH), + &ks, DES_ENCRYPT); + + if (memcmp(crypt_expected, buf, AUTH_LENGTH) != 0) { + message = "Auth Failed: Invalid Password."; + sres = htonl(1); + } else + sres = 0; +#else + sres = 0; + WPRINTF(("Auth not supported, no OpenSSL in your system")); +#endif + + break; + } + + /* 2d. Write back a status */ + stream_write(cfd, &sres, 4); + + if (sres) { +#ifdef __FreeBSD__ + be32enc(buf, strlen(message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen(message)); +#else + be32enc(buf, strlen((char *)message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen((char *)message)); +#endif + goto done; + } + + /* 3a. Read client shared-flag byte */ + len = stream_read(cfd, buf, 1); + + /* 4a. Write server-init info */ + rfb_send_server_init_msg(cfd); + + if (!rc->zbuf) { + rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16); + assert(rc->zbuf != NULL); + } + + rfb_send_screen(rc, cfd, 1); + + perror = pthread_create(&tid, NULL, rfb_wr_thr, rc); + if (perror == 0) + pthread_set_name_np(tid, "rfbout"); + + /* Now read in client requests. 1st byte identifies type */ + for (;;) { + len = read(cfd, buf, 1); + if (len <= 0) { + DPRINTF(("rfb client exiting\r\n")); + break; + } + + switch (buf[0]) { + case 0: + rfb_recv_set_pixfmt_msg(rc, cfd); + break; + case 2: + rfb_recv_set_encodings_msg(rc, cfd); + break; + case 3: + rfb_recv_update_msg(rc, cfd, 1); + break; + case 4: + rfb_recv_key_msg(rc, cfd); + break; + case 5: + rfb_recv_ptr_msg(rc, cfd); + break; + case 6: + rfb_recv_cuttext_msg(rc, cfd); + break; + default: + WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff)); + goto done; + } + } +done: + rc->cfd = -1; + if (perror == 0) + pthread_join(tid, NULL); + if (rc->enc_zlib_ok) + deflateEnd(&rc->zstream); +} + +static void * +rfb_thr(void *arg) +{ + struct rfb_softc *rc; + sigset_t set; + + int cfd; + + rc = arg; + + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { + perror("pthread_sigmask"); + return (NULL); + } + + for (;;) { + rc->enc_raw_ok = false; + rc->enc_zlib_ok = false; + rc->enc_resize_ok = false; + + cfd = accept(rc->sfd, NULL, NULL); + if (rc->conn_wait) { + pthread_mutex_lock(&rc->mtx); + pthread_cond_signal(&rc->cond); + pthread_mutex_unlock(&rc->mtx); + rc->conn_wait = 0; + } + rfb_handle(rc, cfd); + close(cfd); + } + + /* NOTREACHED */ + return (NULL); +} + +static int +sse42_supported(void) +{ + u_int cpu_registers[4], ecx; + + do_cpuid(1, cpu_registers); + + ecx = cpu_registers[2]; + + return ((ecx & CPUID2_SSE42) != 0); +} + +int +rfb_init(char *hostname, int port, int wait, char *password) +{ + int e; + char servname[6]; + struct rfb_softc *rc; + struct addrinfo *ai; + struct addrinfo hints; + int on = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + rc = calloc(1, sizeof(struct rfb_softc)); + + rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + snprintf(servname, sizeof(servname), "%d", port ? port : 5900); + + if (!hostname || strlen(hostname) == 0) +#if defined(INET) + hostname = "127.0.0.1"; +#elif defined(INET6) + hostname = "[::1]"; +#endif + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV | AI_PASSIVE; + + if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e)); + return(-1); + } + + rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0); + if (rc->sfd < 0) { + perror("socket"); + freeaddrinfo(ai); + return (-1); + } + + setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + + if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) { + perror("bind"); + freeaddrinfo(ai); + return (-1); + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + freeaddrinfo(ai); + return (-1); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(rc->sfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + pthread_mutex_init(&rc->mtx, NULL); + pthread_cond_init(&rc->cond, NULL); + } + + pthread_create(&rc->tid, NULL, rfb_thr, rc); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + pthread_mutex_lock(&rc->mtx); + pthread_cond_wait(&rc->cond, &rc->mtx); + pthread_mutex_unlock(&rc->mtx); + } + + freeaddrinfo(ai); + return (0); +} + +#ifndef __FreeBSD__ +int +rfb_init_unix(char *path, int wait, char *password) +{ + struct rfb_softc *rc; + struct sockaddr_un sock; + + if ((rc = calloc(1, sizeof (struct rfb_softc))) == NULL) { + perror("calloc"); + return (-1); + } + rc->sfd = -1; + + if ((rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + if ((rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + rc->sfd = socket(PF_UNIX, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + goto fail; + } + + sock.sun_family = AF_UNIX; + if (strlcpy(sock.sun_path, path, sizeof (sock.sun_path)) >= + sizeof (sock.sun_path)) { + (void) fprintf(stderr, "socket path '%s' too long\n", path); + goto fail; + } + + (void) unlink(path); + if (bind(rc->sfd, (struct sockaddr *)&sock, sizeof (sock)) < 0) { + perror("bind"); + goto fail; + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + goto fail; + } + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + VERIFY3S(pthread_mutex_init(&rc->mtx, NULL), ==, 0); + VERIFY3S(pthread_cond_init(&rc->cond, NULL), ==, 0); + } + + VERIFY3S(pthread_create(&rc->tid, NULL, rfb_thr, rc), ==, 0); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + VERIFY3S(pthread_mutex_lock(&rc->mtx), ==, 0); + VERIFY3S(pthread_cond_wait(&rc->cond, &rc->mtx), ==, 0); + VERIFY3S(pthread_mutex_unlock(&rc->mtx), ==, 0); + } + + return (0); + +fail: + if (rc->sfd != -1) { + VERIFY3S(close(rc->sfd), ==, 0); + } + free(rc->crc); + free(rc->crc_tmp); + free(rc); + return (-1); +} +#endif diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h new file mode 100644 index 0000000000..990e2075ac --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RFB_H_ +#define _RFB_H_ + +#define RFB_PORT 5900 + +int rfb_init(char *hostname, int port, int wait, char *password); +#ifndef __FreeBSD__ +int rfb_init_unix(char *path, int wait, char *password); +#endif + +#endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c new file mode 100644 index 0000000000..09ca3f61ae --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.c @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <time.h> +#include <assert.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "pci_lpc.h" +#include "rtc.h" + +#define IO_RTC 0x70 + +#define RTC_LMEM_LSB 0x34 +#define RTC_LMEM_MSB 0x35 +#define RTC_HMEM_LSB 0x5b +#define RTC_HMEM_SB 0x5c +#define RTC_HMEM_MSB 0x5d + +#define m_64KB (64*1024) +#define m_16MB (16*1024*1024) +#define m_4GB (4ULL*1024*1024*1024) + +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) +{ + struct tm tm; + time_t t; + + time(&t); + if (use_localtime) { + localtime_r(&t, &tm); + t = timegm(&tm); + } + return (t); +} + +void +rtc_init(struct vmctx *ctx, int use_localtime) +{ + size_t himem; + size_t lomem; + int err; + + /* XXX init diag/reset code/equipment/checksum ? */ + + /* + * Report guest memory size in nvram cells as required by UEFI. + * Little-endian encoding. + * 0x34/0x35 - 64KB chunks above 16MB, below 4GB + * 0x5b/0x5c/0x5d - 64KB chunks above 4GB + */ + lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); + + himem = vm_get_highmem_size(ctx) / m_64KB; + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); + + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} + +static void +rtc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (RTC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_RTC, 2); + dsdt_fixed_irq(8); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(rtc_dsdt); + +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ +SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h new file mode 100644 index 0000000000..1c108eed99 --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.h @@ -0,0 +1,36 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RTC_H_ +#define _RTC_H_ + +void rtc_init(struct vmctx *ctx, int use_localtime); + +#endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c new file mode 100644 index 0000000000..35a41a0855 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -0,0 +1,907 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <assert.h> +#include <errno.h> +#include <md5.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <uuid.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "smbiostbl.h" + +#define MB (1024*1024) +#define GB (1024ULL*1024*1024) + +#define SMBIOS_BASE 0xF1000 + +/* BHYVE_ACPI_BASE - SMBIOS_BASE) */ +#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) + +#define SMBIOS_TYPE_BIOS 0 +#define SMBIOS_TYPE_SYSTEM 1 +#define SMBIOS_TYPE_CHASSIS 3 +#define SMBIOS_TYPE_PROCESSOR 4 +#define SMBIOS_TYPE_MEMARRAY 16 +#define SMBIOS_TYPE_MEMDEVICE 17 +#define SMBIOS_TYPE_MEMARRAYMAP 19 +#define SMBIOS_TYPE_BOOT 32 +#define SMBIOS_TYPE_EOT 127 + +struct smbios_structure { + uint8_t type; + uint8_t length; + uint16_t handle; +} __packed; + +typedef int (*initializer_func_t)(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_template_entry { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; +}; + +/* + * SMBIOS Structure Table Entry Point + */ +#define SMBIOS_ENTRY_EANCHOR "_SM_" +#define SMBIOS_ENTRY_EANCHORLEN 4 +#define SMBIOS_ENTRY_IANCHOR "_DMI_" +#define SMBIOS_ENTRY_IANCHORLEN 5 + +struct smbios_entry_point { + char eanchor[4]; /* anchor tag */ + uint8_t echecksum; /* checksum of entry point structure */ + uint8_t eplen; /* length in bytes of entry point */ + uint8_t major; /* major version of the SMBIOS spec */ + uint8_t minor; /* minor version of the SMBIOS spec */ + uint16_t maxssize; /* maximum size in bytes of a struct */ + uint8_t revision; /* entry point structure revision */ + uint8_t format[5]; /* entry point rev-specific data */ + char ianchor[5]; /* intermediate anchor tag */ + uint8_t ichecksum; /* intermediate checksum */ + uint16_t stlen; /* len in bytes of structure table */ + uint32_t staddr; /* physical addr of structure table */ + uint16_t stnum; /* number of structure table entries */ + uint8_t bcdrev; /* BCD value representing DMI ver */ +} __packed; + +/* + * BIOS Information + */ +#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ +#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ +#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ +#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ +#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ +#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ + +#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ + +#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ +#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ + +struct smbios_table_type0 { + struct smbios_structure header; + uint8_t vendor; /* vendor string */ + uint8_t version; /* version string */ + uint16_t segment; /* address segment location */ + uint8_t rel_date; /* release date */ + uint8_t size; /* rom size */ + uint64_t cflags; /* characteristics */ + uint8_t xc_bytes[2]; /* characteristics ext bytes */ + uint8_t sb_major_rel; /* system bios version */ + uint8_t sb_minor_rele; + uint8_t ecfw_major_rel; /* embedded ctrl fw version */ + uint8_t ecfw_minor_rel; +} __packed; + +/* + * System Information + */ +#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ + +struct smbios_table_type1 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t product; /* product name string */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t uuid[16]; /* uuid byte array */ + uint8_t wakeup; /* wake-up event */ + uint8_t sku; /* sku number string */ + uint8_t family; /* family name string */ +} __packed; + +/* + * System Enclosure or Chassis + */ +#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_CHST_SAFE 0x03 /* safe */ + +#define SMBIOS_CHSC_NONE 0x03 /* none */ + +struct smbios_table_type3 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t type; /* type */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t bustate; /* boot-up state */ + uint8_t psstate; /* power supply state */ + uint8_t tstate; /* thermal state */ + uint8_t security; /* security status */ + uint8_t uheight; /* height in 'u's */ + uint8_t cords; /* number of power cords */ + uint8_t elems; /* number of element records */ + uint8_t elemlen; /* length of records */ + uint8_t sku; /* sku number string */ +} __packed; + +/* + * Processor Information + */ +#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ + +#define SMBIOS_PRF_OTHER 0x01 /* other */ + +#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ +#define SMBIOS_PRS_ENABLED 0x1 /* enabled */ + +#define SMBIOS_PRU_NONE 0x06 /* none */ + +#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ + +struct smbios_table_type4 { + struct smbios_structure header; + uint8_t socket; /* socket designation string */ + uint8_t type; /* processor type */ + uint8_t family; /* processor family */ + uint8_t manufacturer; /* manufacturer string */ + uint64_t cpuid; /* processor cpuid */ + uint8_t version; /* version string */ + uint8_t voltage; /* voltage */ + uint16_t clkspeed; /* ext clock speed in mhz */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint16_t curspeed; /* current speed in mhz */ + uint8_t status; /* status */ + uint8_t upgrade; /* upgrade */ + uint16_t l1handle; /* l1 cache handle */ + uint16_t l2handle; /* l2 cache handle */ + uint16_t l3handle; /* l3 cache handle */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t cores; /* cores per socket */ + uint8_t ecores; /* enabled cores */ + uint8_t threads; /* threads per socket */ + uint16_t cflags; /* processor characteristics */ + uint16_t family2; /* processor family 2 */ +} __packed; + +/* + * Physical Memory Array + */ +#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ + +#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ + +#define SMBIOS_MAE_NONE 0x03 /* none */ + +struct smbios_table_type16 { + struct smbios_structure header; + uint8_t location; /* physical device location */ + uint8_t use; /* device functional purpose */ + uint8_t ecc; /* err detect/correct method */ + uint32_t size; /* max mem capacity in kb */ + uint16_t errhand; /* handle of error (if any) */ + uint16_t ndevs; /* num of slots or sockets */ + uint64_t xsize; /* max mem capacity in bytes */ +} __packed; + +/* + * Memory Device + */ +#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ + +struct smbios_table_type17 { + struct smbios_structure header; + uint16_t arrayhand; /* handle of physl mem array */ + uint16_t errhand; /* handle of mem error data */ + uint16_t twidth; /* total width in bits */ + uint16_t dwidth; /* data width in bits */ + uint16_t size; /* size in bytes */ + uint8_t form; /* form factor */ + uint8_t set; /* set */ + uint8_t dloc; /* device locator string */ + uint8_t bloc; /* phys bank locator string */ + uint8_t type; /* memory type */ + uint16_t flags; /* memory characteristics */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint8_t manufacturer; /* manufacturer string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t attributes; /* attributes */ + uint32_t xsize; /* extended size in mbs */ + uint16_t curspeed; /* current speed in mhz */ + uint16_t minvoltage; /* minimum voltage */ + uint16_t maxvoltage; /* maximum voltage */ + uint16_t curvoltage; /* configured voltage */ +} __packed; + +/* + * Memory Array Mapped Address + */ +struct smbios_table_type19 { + struct smbios_structure header; + uint32_t saddr; /* start phys addr in kb */ + uint32_t eaddr; /* end phys addr in kb */ + uint16_t arrayhand; /* physical mem array handle */ + uint8_t width; /* num of dev in row */ + uint64_t xsaddr; /* start phys addr in bytes */ + uint64_t xeaddr; /* end phys addr in bytes */ +} __packed; + +/* + * System Boot Information + */ +#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ + +struct smbios_table_type32 { + struct smbios_structure header; + uint8_t reserved[6]; + uint8_t status; /* boot status */ +} __packed; + +/* + * End-of-Table + */ +struct smbios_table_type127 { + struct smbios_structure header; +} __packed; + +struct smbios_table_type0 smbios_type0_template = { + { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, + 1, /* bios vendor string */ + 2, /* bios version string */ + 0xF000, /* bios address segment location */ + 3, /* bios release date */ + 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ + SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | + SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, + { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, + 0x0, /* bios major release */ + 0x0, /* bios minor release */ + 0xff, /* embedded controller firmware major release */ + 0xff /* embedded controller firmware minor release */ +}; + +const char *smbios_type0_strings[] = { + "BHYVE", /* vendor string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ + NULL +}; + +struct smbios_table_type1 smbios_type1_template = { + { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, + 1, /* manufacturer string */ + 2, /* product string */ + 3, /* version string */ + 4, /* serial number string */ + { 0 }, + SMBIOS_WAKEUP_SWITCH, + 5, /* sku string */ + 6 /* family string */ +}; + +static int smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +const char *smbios_type1_strings[] = { + " ", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + " ", /* family name string */ + NULL +}; + +struct smbios_table_type3 smbios_type3_template = { + { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, + 1, /* manufacturer string */ + SMBIOS_CHT_UNKNOWN, + 2, /* version string */ + 3, /* serial number string */ + 4, /* asset tag string */ + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHSC_NONE, + 0, /* height in 'u's (0=enclosure height unspecified) */ + 0, /* number of power cords (0=number unspecified) */ + 0, /* number of contained element records */ + 0, /* length of records */ + 5 /* sku number string */ +}; + +const char *smbios_type3_strings[] = { + " ", /* manufacturer string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* sku number string */ + NULL +}; + +struct smbios_table_type4 smbios_type4_template = { + { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, + 1, /* socket designation string */ + SMBIOS_PRT_CENTRAL, + SMBIOS_PRF_OTHER, + 2, /* manufacturer string */ + 0, /* cpuid */ + 3, /* version string */ + 0, /* voltage */ + 0, /* external clock frequency in mhz (0=unknown) */ + 0, /* maximum frequency in mhz (0=unknown) */ + 0, /* current frequency in mhz (0=unknown) */ + SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, + SMBIOS_PRU_NONE, + -1, /* l1 cache handle */ + -1, /* l2 cache handle */ + -1, /* l3 cache handle */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* cores per socket (0=unknown) */ + 0, /* enabled cores per socket (0=unknown) */ + 0, /* threads per socket (0=unknown) */ + SMBIOS_PFL_64B, + SMBIOS_PRF_OTHER +}; + +const char *smbios_type4_strings[] = { + " ", /* socket designation string */ + " ", /* manufacturer string */ + " ", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type16 smbios_type16_template = { + { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, + SMBIOS_MAL_SYSMB, + SMBIOS_MAU_SYSTEM, + SMBIOS_MAE_NONE, + 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ + -1, /* handle of error (if any) */ + 0, /* number of slots or sockets (TBD) */ + 0 /* extended maximum memory capacity in bytes (TBD) */ +}; + +static int smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type17 smbios_type17_template = { + { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, + -1, /* handle of physical memory array */ + -1, /* handle of memory error data */ + 64, /* total width in bits including ecc */ + 64, /* data width in bits */ + 0x7fff, /* size in bytes (0x7fff=use extended)*/ + SMBIOS_MDFF_UNKNOWN, + 0, /* set (0x00=none, 0xff=unknown) */ + 1, /* device locator string */ + 2, /* physical bank locator string */ + SMBIOS_MDT_UNKNOWN, + SMBIOS_MDF_UNKNOWN, + 0, /* maximum memory speed in mhz (0=unknown) */ + 3, /* manufacturer string */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* attributes (0=unknown rank information) */ + 0, /* extended size in mb (TBD) */ + 0, /* current speed in mhz (0=unknown) */ + 0, /* minimum voltage in mv (0=unknown) */ + 0, /* maximum voltage in mv (0=unknown) */ + 0 /* configured voltage in mv (0=unknown) */ +}; + +const char *smbios_type17_strings[] = { + " ", /* device locator string */ + " ", /* physical bank locator string */ + " ", /* manufacturer string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type19 smbios_type19_template = { + { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, + 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ + 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ + -1, /* physical memory array handle */ + 1, /* number of devices that form a row */ + 0, /* extended starting phys addr in bytes (TDB) */ + 0 /* extended ending phys addr in bytes (TDB) */ +}; + +static int smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type32 smbios_type32_template = { + { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, + { 0, 0, 0, 0, 0, 0 }, + SMBIOS_BOOT_NORMAL +}; + +struct smbios_table_type127 smbios_type127_template = { + { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } +}; + +static int smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_template_entry smbios_template[] = { + { (struct smbios_structure *)&smbios_type0_template, + smbios_type0_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type1_template, + smbios_type1_strings, + smbios_type1_initializer }, + { (struct smbios_structure *)&smbios_type3_template, + smbios_type3_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type4_template, + smbios_type4_strings, + smbios_type4_initializer }, + { (struct smbios_structure *)&smbios_type16_template, + NULL, + smbios_type16_initializer }, + { (struct smbios_structure *)&smbios_type17_template, + smbios_type17_strings, + smbios_type17_initializer }, + { (struct smbios_structure *)&smbios_type19_template, + NULL, + smbios_type19_initializer }, + { (struct smbios_structure *)&smbios_type32_template, + NULL, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type127_template, + NULL, + smbios_generic_initializer }, + { NULL,NULL, NULL } +}; + +static uint64_t guest_lomem, guest_himem; +static uint16_t type16_handle; + +static int +smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_structure *entry; + + memcpy(curaddr, template_entry, template_entry->length); + entry = (struct smbios_structure *)curaddr; + entry->handle = *n + 1; + curaddr += entry->length; + if (template_strings != NULL) { + int i; + + for (i = 0; template_strings[i] != NULL; i++) { + const char *string; + int len; + + string = template_strings[i]; + len = strlen(string) + 1; + memcpy(curaddr, string, len); + curaddr += len; + } + *curaddr = '\0'; + curaddr++; + } else { + /* Minimum string section is double nul */ + *curaddr = '\0'; + curaddr++; + *curaddr = '\0'; + curaddr++; + } + (*n)++; + *endaddr = curaddr; + + return (0); +} + +static int +smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type1 *type1; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type1 = (struct smbios_table_type1 *)curaddr; + + if (guest_uuid_str != NULL) { + uuid_t uuid; + uint32_t status; + + uuid_from_string(guest_uuid_str, &uuid, &status); + if (status != uuid_s_ok) + return (-1); + + uuid_enc_le(&type1->uuid, &uuid); + } else { + MD5_CTX mdctx; + u_char digest[16]; + char hostname[MAXHOSTNAMELEN]; + + /* + * Universally unique and yet reproducible are an + * oxymoron, however reproducible is desirable in + * this case. + */ + if (gethostname(hostname, sizeof(hostname))) + return (-1); + + MD5Init(&mdctx); + MD5Update(&mdctx, vmname, strlen(vmname)); + MD5Update(&mdctx, hostname, sizeof(hostname)); + MD5Final(digest, &mdctx); + + /* + * Set the variant and version number. + */ + digest[6] &= 0x0F; + digest[6] |= 0x30; /* version 3 */ + digest[8] &= 0x3F; + digest[8] |= 0x80; + + memcpy(&type1->uuid, digest, sizeof (digest)); + } + + return (0); +} + +static int +smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + int i; + + for (i = 0; i < guest_ncpus; i++) { + struct smbios_table_type4 *type4; + char *p; + int nstrings, len; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type4 = (struct smbios_table_type4 *)curaddr; + p = curaddr + sizeof (struct smbios_table_type4); + nstrings = 0; + while (p < *endaddr - 1) { + if (*p++ == '\0') + nstrings++; + } + len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; + *endaddr += len - 1; + *(*endaddr) = '\0'; + (*endaddr)++; + type4->socket = nstrings + 1; + curaddr = *endaddr; + } + + return (0); +} + +static int +smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type16 *type16; + + type16_handle = *n; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type16 = (struct smbios_table_type16 *)curaddr; + type16->xsize = guest_lomem + guest_himem; + type16->ndevs = guest_himem > 0 ? 2 : 1; + + return (0); +} + +static int +smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type17 *type17; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_himem; + } + + return (0); +} + +static int +smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type19 *type19; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 0; + type19->xeaddr = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 4*GB; + type19->xeaddr = guest_himem; + } + + return (0); +} + +static void +smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr) +{ + memset(smbios_ep, 0, sizeof(*smbios_ep)); + memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, + SMBIOS_ENTRY_EANCHORLEN); + smbios_ep->eplen = 0x1F; + assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); + smbios_ep->major = 2; + smbios_ep->minor = 6; + smbios_ep->revision = 0; + memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, + SMBIOS_ENTRY_IANCHORLEN); + smbios_ep->staddr = staddr; + smbios_ep->bcdrev = 0x24; +} + +static void +smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, + uint16_t num, uint16_t maxssize) +{ + uint8_t checksum; + int i; + + smbios_ep->maxssize = maxssize; + smbios_ep->stlen = len; + smbios_ep->stnum = num; + + checksum = 0; + for (i = 0x10; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->ichecksum = checksum; + + checksum = 0; + for (i = 0; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->echecksum = checksum; +} + +int +smbios_build(struct vmctx *ctx) +{ + struct smbios_entry_point *smbios_ep; + uint16_t n; + uint16_t maxssize; + char *curaddr, *startaddr, *ststartaddr; + int i; + int err; + + guest_lomem = vm_get_lowmem_size(ctx); + guest_himem = vm_get_highmem_size(ctx); + + startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "smbios table requires mapped mem\n"); + return (ENOMEM); + } + + curaddr = startaddr; + + smbios_ep = (struct smbios_entry_point *)curaddr; + smbios_ep_initializer(smbios_ep, SMBIOS_BASE + + sizeof(struct smbios_entry_point)); + curaddr += sizeof(struct smbios_entry_point); + ststartaddr = curaddr; + + n = 0; + maxssize = 0; + for (i = 0; smbios_template[i].entry != NULL; i++) { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; + char *endaddr; + uint16_t size; + + entry = smbios_template[i].entry; + strings = smbios_template[i].strings; + initializer = smbios_template[i].initializer; + + err = (*initializer)(entry, strings, curaddr, &endaddr, + &n, &size); + if (err != 0) + return (err); + + if (size > maxssize) + maxssize = size; + + curaddr = endaddr; + } + + assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); + smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize); + + return (0); +} + +int +smbios_parse(const char *opts) +{ + char *buf; + char *lasts; + char *token; + char *end; + long type; + struct { + const char *key; + const char **targetp; + } type1_map[] = { + { "manufacturer", &smbios_type1_strings[0] }, + { "product", &smbios_type1_strings[1] }, + { "version", &smbios_type1_strings[2] }, + { "serial", &smbios_type1_strings[3] }, + { "sku", &smbios_type1_strings[4] }, + { "family", &smbios_type1_strings[5] }, + { "uuid", (const char **)&guest_uuid_str }, + { 0 } + }; + + if ((buf = strdup(opts)) == NULL) { + (void) fprintf(stderr, "out of memory\n"); + return (-1); + } + + if ((token = strtok_r(buf, ",", &lasts)) == NULL) { + (void) fprintf(stderr, "too few fields\n"); + goto fail; + } + + errno = 0; + type = strtol(token, &end, 10); + if (errno != 0 || *end != '\0') { + (void) fprintf(stderr, "first token '%s' is not an integer\n", + token); + goto fail; + } + + /* For now, only type 1 is supported. */ + if (type != 1) { + (void) fprintf(stderr, "unsupported type %d\n", type); + goto fail; + } + + while ((token = strtok_r(NULL, ",", &lasts)) != NULL) { + char *val; + int i; + + if ((val = strchr(token, '=')) == NULL) { + (void) fprintf(stderr, "invalid key=value: '%s'\n", + token); + goto fail; + } + *val = '\0'; + val++; + + for (i = 0; type1_map[i].key != NULL; i++) { + if (strcmp(token, type1_map[i].key) == 0) { + break; + } + } + if (type1_map[i].key == NULL) { + (void) fprintf(stderr, "invalid key '%s'\n", token); + goto fail; + } + *type1_map[i].targetp = val; + } + + return (0); + +fail: + free(buf); + return (-1); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h new file mode 100644 index 0000000000..81e26309e5 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _SMBIOSTBL_H_ +#define _SMBIOSTBL_H_ + +struct vmctx; + +int smbios_build(struct vmctx *ctx); +int smbios_parse(const char *opts); + +#endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c new file mode 100644 index 0000000000..b592bce9aa --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.c @@ -0,0 +1,86 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <unistd.h> + +#include <errno.h> + +#include "sockstream.h" + +ssize_t +stream_read(int fd, void *buf, ssize_t nbytes) +{ + uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = read(fd, p + len, nbytes - len); + if (n == 0) + break; + + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} + +ssize_t +stream_write(int fd, const void *buf, ssize_t nbytes) +{ + const uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = write(fd, p + len, nbytes - len); + if (n == 0) + break; + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h new file mode 100644 index 0000000000..ecea849471 --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.h @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/types.h> +#include <unistd.h> + +ssize_t stream_read(int fd, void *buf, ssize_t nbytes); +ssize_t stream_write(int fd, const void *buf, ssize_t nbytes); diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c new file mode 100644 index 0000000000..ecdd05694c --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -0,0 +1,110 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "bhyverun.h" +#include "spinup_ap.h" + +static void +spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) +{ + int vector, error; + uint16_t cs; + uint64_t desc_base; + uint32_t desc_limit, desc_access; + + vector = *rip >> PAGE_SHIFT; + *rip = 0; + + /* + * Update the %cs and %rip of the guest so that it starts + * executing real mode code at at 'vector << 12'. + */ + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip); + assert(error == 0); + + error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + &desc_limit, &desc_access); + assert(error == 0); + + desc_base = vector << PAGE_SHIFT; + error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + assert(error == 0); + + cs = (vector << PAGE_SHIFT) >> 4; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + assert(error == 0); +} + +int +spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + /* + * Enable the 'unrestricted guest' mode for 'newcpu'. + * + * Set up the processor state in power-on 16-bit mode, with the CS:IP + * init'd to the specified low-mem 4K page. + */ + error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + spinup_ap_realmode(ctx, newcpu, &rip); + +#ifdef __FreeBSD__ + fbsdrun_addcpu(ctx, vcpu, newcpu, rip); +#else + fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false); +#endif + + return (newcpu); +} diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h new file mode 100644 index 0000000000..226542f6c3 --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -0,0 +1,36 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPINUP_AP_H_ +#define _SPINUP_AP_H_ + +int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); + +#endif diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c new file mode 100644 index 0000000000..b5950a19d8 --- /dev/null +++ b/usr/src/cmd/bhyve/task_switch.c @@ -0,0 +1,941 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> +#include <x86/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include <vmmapi.h> + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. + */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. + * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + if (error || *faultptr) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present */ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && (rpl != cpl || dpl != cpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + if (codeseg) { + conforming = (usd.sd_type & 0x4) ? true : false; + if ((conforming && (cpl < dpl)) || + (!conforming && (cpl != dpl))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + + if (dataseg) { + /* + * A data segment is always non-conforming except when it's + * descriptor is a readable, conforming code segment. + */ + if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) + conforming = true; + else + conforming = false; + + if (!conforming && (rpl > dpl || cpl > dpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + *seg_desc = usd_to_seg_desc(&usd); + return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, + uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + + /* General purpose registers */ + tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + + /* Segment selectors */ + tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); + tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + + /* eflags and eip */ + tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + if (task_switch->reason == TSR_IRET) + tss->tss_eflags &= ~PSL_NT; + tss->tss_eip = eip; + + /* Copy updated old TSS into guest memory */ + vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ + int error; + + error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) +{ + struct seg_desc seg_desc, seg_desc2; + uint64_t *pdpte, maxphyaddr, reserved; + uint32_t eflags; + int error, i; + bool nested; + + nested = false; + if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { + tss->tss_link = ot_sel; + nested = true; + } + + eflags = tss->tss_eflags; + if (nested) + eflags |= PSL_NT; + + /* LDTR */ + SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + + /* PBDR */ + if (ts->paging.paging_mode != PAGING_MODE_FLAT) { + if (ts->paging.paging_mode == PAGING_MODE_PAE) { + /* + * XXX Assuming 36-bit MAXPHYADDR. + */ + maxphyaddr = (1UL << 36) - 1; + pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); + for (i = 0; i < 4; i++) { + /* Check reserved bits if the PDPTE is valid */ + if (!(pdpte[i] & 0x1)) + continue; + /* + * Bits 2:1, 8:5 and bits above the processor's + * maximum physical address are reserved. + */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). + */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + *faultptr = 1; + return (0); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + */ +#define CHKERR(error,fault) \ + do { \ + assert((error == 0) || (error == EFAULT)); \ + if (error) \ + return (VMEXIT_ABORT); \ + else if (fault) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Calculate the instruction pointer to store in the old TSS. + */ + eip = vmexit->rip + vmexit->inst_length; + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. + * The following page table accesses are implicitly supervisor mode: + * - accesses to GDT or LDT to load segment descriptors + * - accesses to the task state segment during task switch + */ + sup_paging = *paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + + /* Fetch the new TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, + &fault); + CHKERR(error, fault); + + nt = usd_to_seg_desc(&nt_desc); + + /* Verify the type of the new TSS */ + nt_type = SEG_DESC_TYPE(nt.access); + if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && + nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS descriptor must have present bit set */ + if (!SEG_DESC_PRESENT(nt.access)) { + sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + goto done; + } + + /* + * TSS must have a minimum length of 104 bytes for a 32-bit TSS and + * 44 bytes for a 16-bit TSS. + */ + if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) + minlimit = 104 - 1; + else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) + minlimit = 44 - 1; + else + minlimit = 0; + + assert(minlimit > 0); + if (nt.limit < minlimit) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS must be busy if task switch is due to IRET */ + if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* + * TSS must be available (not busy) if task switch reason is + * CALL, JMP, exception or interrupt. + */ + if (reason != TSR_IRET && TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + goto done; + } + + /* Fetch the new TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + + /* Get the old TSS selector from the guest's task register */ + ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { + /* + * This might happen if a task switch was attempted without + * ever loading the task register with LTR. In this case the + * TR would contain the values from power-on: + * (sel = 0, base = 0, limit = 0xffff). + */ + sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + goto done; + } + + /* Get the old TSS base and limit from the guest's task register */ + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + &access); + assert(error == 0); + assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); + ot_type = SEG_DESC_TYPE(access); + assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + + /* Fetch the old TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, + &fault); + CHKERR(error, fault); + + /* Get the old TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + + /* + * Clear the busy bit in the old TSS descriptor if the task switch + * due to an IRET or JMP instruction. + */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + &ot_desc, &fault); + CHKERR(error, fault); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + &nt_desc, &fault); + CHKERR(error, fault); + } + + /* Update task register to point at the new TSS */ + SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); + + /* Load processor state from new TSS */ + error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, + &fault); + CHKERR(error, fault); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + task_switch->errcode, &fault); + CHKERR(error, fault); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. + */ + if (task_switch->reason == TSR_IDT_GATE) { + error = vm_set_intinfo(ctx, vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/usr/src/cmd/bhyve/test/Makefile b/usr/src/cmd/bhyve/test/Makefile new file mode 100644 index 0000000000..7dbee0c5f3 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +SUBDIRS = scripts tst + +include Makefile.subdirs diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com new file mode 100644 index 0000000000..3c719bcea7 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.com @@ -0,0 +1,60 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.cmd.64 + +# +# Force c99 for everything +# +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CPPFLAGS = -I$(SRC)/cmd/bhyve \ + -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common \ + -DWITHOUT_CAPSICUM +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +CLEANFILES += $(EXETESTS) +CLOBBERFILES += $(ROOTTESTS) + +# +# Install related definitions +# +ROOTOPTPKG = $(ROOT)/opt/bhyvetest +ROOTBIN = $(ROOTOPTPKG)/bin +ROOTTST = $(ROOTOPTPKG)/tst +ROOTTSTDIR = $(ROOTTST)/$(TSTDIR) +ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%) +ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%) +ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%) +ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT) +FILEMODE = 0555 +LDLIBS = $(LDLIBS.cmd) +LINTEXE = $(EXETESTS:%.exe=%.exe.ln) diff --git a/usr/src/cmd/bhyve/test/Makefile.subdirs b/usr/src/cmd/bhyve/test/Makefile.subdirs new file mode 100644 index 0000000000..45f0aa67fa --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.subdirs @@ -0,0 +1,29 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +.KEEP_STATE: + +all := TARGET += all +clean := TARGET += clean +clobber := TARGET += clobber +install := TARGET += install +lint := TARGET += lint + +all clean clobber install lint: $(SUBDIRS) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ new file mode 100644 index 0000000000..e3ec55cfdb --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.targ @@ -0,0 +1,55 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +$(ROOTOPTPKG): + $(INS.dir) + +$(ROOTBIN): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTBIN)/%: %.ksh $(ROOTBIN) + $(INS.rename) + +$(ROOTTST): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTTSTDIR): $(ROOTTST) + $(INS.dir) + +$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR) + $(INS.file) + +$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR) + $(INS.file) + +%.exe: %.o $(SUPOBJS) + $(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS) + $(POST_PROCESS) + +$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR) + $(INS.file) + +all: install + +%.exe.ln: %.c $(SUPOBJS) + $(LINT.c) $< $(LDLIBS) + +lint: $(LINTEXE) + +clean: + -$(RM) *.o $(CLEANFILES) + +clobber: clean + -$(RM) $(CLOBBERFILES) diff --git a/usr/src/cmd/bhyve/test/scripts/Makefile b/usr/src/cmd/bhyve/test/scripts/Makefile new file mode 100644 index 0000000000..d28a5edb8f --- /dev/null +++ b/usr/src/cmd/bhyve/test/scripts/Makefile @@ -0,0 +1,28 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com + +SRCS = bhyvetest +SCRIPTS = $(SRCS:%=$(ROOTBIN)/%) + +SCRIPTS := FILEMODE = 0555 +CLOBBERFILES = $(SCRIPTS) + +install: $(SCRIPTS) + +lint: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh new file mode 100644 index 0000000000..95b7743417 --- /dev/null +++ b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh @@ -0,0 +1,231 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +# +# bhyve test suite driver +# +unalias -a + +bt_arg0=$(basename $0) +bt_root="$(cd $(dirname $0)/..; pwd -P)" +bt_ksh="/usr/bin/ksh" +bt_outdir= +bt_keep= +bt_all= +bt_tnum=0 +bt_tfail=0 +bt_tsuc=0 + +function usage +{ + typeset msg="$*" + [[ -z "$msg" ]] || echo "$msg" 2>&1 + cat <<USAGE >&2 +Usage: $bt_arg0 [ -o dir ] [ -k ] [ -a | test ... ] + + -o dir Sets 'dir' as the output directory + -a Runs all tests, ignores tests passed in + -k Keep output from all tests, not just failures + -m mdb binary to test +USAGE + exit 2 +} + +function fatal +{ + typeset msg="$*" + [[ -z "$msg" ]] && msg="failed" + echo "$bt_arg0: $msg" >&2 + exit 1 +} + +function setup_outdir +{ + bt_outdir="$bt_outdir/$bt_arg0.$$" + mkdir -p $bt_outdir || fatal "failed to make output dir $bt_outdir" +} + +function run_single +{ + typeset name=$1 + typeset expect base ext exe command odir res reason + typeset iserr + + [[ -z "$name" ]] && fail "missing test to run" + base=${name##*/} + ext=${base##*.} + expect=${base%%.*} + odir="$bt_outdir/current" + [[ -z "$ext" ]] && fatal "found test without ext: $name" + [[ -z "$expect" ]] && fatal "found test without prefix: $name" + + if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then + iserr="yup" + else + iserr="" + fi + + case "$ext" in + "ksh") + command="$bt_ksh ./$base" + ;; + "exe") + command="./$base" + ;; + "out") + # + # This is the file format for checking output against. + # + return 0 + ;; + *) + echo "skipping test $name (unknown extensino)" + return 0 + ;; + esac + + echo "Executing test $name ... \c" + mkdir -p "$odir" >/dev/null || fatal "can't make output directory" + cd $(dirname $name) || fatal "failed to enter test directory" + $command > "$odir/stdout" 2>"$odir/stderr" + res=$? + cd - > /dev/null || fatal "failed to leave test directory" + + if [[ -f "$name.out" ]] && \ + ! diff "$name.out" "$odir/stdout" >/dev/null; then + cp $name.out $odir/$base.out + reason="stdout mismatch" + elif [[ -n "$iserr" && $res -eq 0 ]]; then + reason="test exited $res, not non-zero" + elif [[ -z "$iserr" && $res -ne 0 ]]; then + reason="test exited $res, not zero" + fi + + if [[ -n "$reason" ]]; then + echo "$reason" + ((bt_tfail++)) + mv "$odir" "$bt_outdir/failure.$bt_tfail" || fatal \ + "failed to move test output directory" + cp "$name" "$bt_outdir/failure.$bt_tfail/$(basename $name)" || \ + fatal "failed to copy test into output directory" + else + echo "passed" + ((bt_tsuc++)) + mv "$odir" "$bt_outdir/success.$bt_tsuc" || fatal \ + "failed to move test directory" + fi + + ((bt_tnum++)) +} + +function run_all +{ + typeset tests t dir + + tests=$(ls -1 $bt_root/tst/*/*.@(ksh|exe)) + for t in $tests; do + run_single $t + done +} + +function welcome +{ + cat <<WELCOME +Starting tests... +output directory: $bt_outdir +WELCOME +} + +function cleanup +{ + [[ -n "$bt_keep" ]] && return + rm -rf "$bt_outdir"/success.* || fatal \ + "failed to remove successful test cases" + if [[ $bt_tfail -eq 0 ]]; then + rmdir "$bt_outdir" || fatal \ + "failed to remove test output directory" + fi +} + +function goodbye +{ + cat <<EOF + +------------- +Results +------------- + +Tests passed: $bt_tsuc +Tests failed: $bt_tfail +Tests ran: $bt_tnum + +EOF + if [[ $bt_tfail -eq 0 ]]; then + echo "Congrats, some tiny parts of bhyve aren't completely" \ + "broken, the tests pass". + else + echo "Some tests failed, you have some work to do." + fi +} + +while getopts ":ahko:m:" c $@; do + case "$c" in + a) + bt_all="y" + ;; + k) + bt_keep="y" + ;; + o) + bt_outdir="$OPTARG" + ;; + h) + usage + ;; + :) + usage "option requires an argument -- $OPTARG" + ;; + *) + usage "invalid option -- $OPTARG" + ;; + esac +done + +shift $((OPTIND-1)) + +[[ -z "$bt_all" && $# == 0 ]] && usage "no tests to run" + +[[ -z "$bt_outdir" ]] && bt_outdir="$PWD" + +setup_outdir +welcome + +if [[ ! -z "$bt_all" ]]; then + run_all +else + for t in $@; do + [[ -f $t ]] || fatal "cannot find test $t" + run_single $t + done +fi + +goodbye +cleanup + +# +# Exit 1 if we have tests that return non-zero +# +[[ $bt_tfai -eq 0 ]] diff --git a/usr/src/cmd/bhyve/test/tst/Makefile b/usr/src/cmd/bhyve/test/tst/Makefile new file mode 100644 index 0000000000..f6a6ec96fc --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +SUBDIRS = mevent + +include ../Makefile.subdirs diff --git a/usr/src/cmd/bhyve/test/tst/mevent/Makefile b/usr/src/cmd/bhyve/test/tst/mevent/Makefile new file mode 100644 index 0000000000..047886bc6a --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/Makefile @@ -0,0 +1,30 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +TSTDIR = mevent +EXETESTS = \ + lists.delete.exe \ + read.disable.exe \ + read.pause.exe \ + read.requeue.exe \ + +SHTESTS = +SUPOBJS = mevent.o testlib.o + +include ../../Makefile.com + +install: $(ROOTTESTS) + +include ../../Makefile.targ diff --git a/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c new file mode 100644 index 0000000000..d09ac133a3 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c @@ -0,0 +1,172 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: lists.delete + * Assertion: mevent_delete() causes the total number of events to decrease + * + * Strategy: 1. Create a pipe. + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will do nothing other than generate an error if it + * is called. + * 3. Create another pipe and add a read event watcher to it. The + * callback will signal a cv when called. A write to the pipe + * followed by a wait on the cv will ensure that async + * operations in mevent.c are complete. See flush_and_wait(). + * 4. Call flush_and_wait(), then get event count. + * 5. Delete the event created in step 2. + * 6. Call flush_and_wait(), then get event count. + * 7. Verify result in step 6 is one less than result in step 4. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static int +get_count(void) +{ + int global = -1, change = -1, del_pending = -1; + int total; + + test_mevent_count_lists(&global, &change, &del_pending); + ASSERT_INT_NEQ(("count not set"), global, -1); + ASSERT_INT_NEQ(("count not set"), change, -1); + ASSERT_INT_NEQ(("count not set"), change, -1); + ASSERT_INT_EQ(("pending delete not processed"), del_pending, 0); + + total = global + change + del_pending; + + VERBOSE(("count = %d (%d + %d + %d)", total, global, change, + del_pending)); + + return (total); +} + +static void +not_called_cb(int fd, enum ev_type ev, void *arg) +{ + FAIL(("this callback should never be called")); +} + +static void +flush_cb(int fd, enum ev_type ev, void *arg) +{ + char buf[32]; + + /* Drain the pipe */ + while (read(fd, buf, sizeof (buf)) > 0) + ; + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + pthread_mutex_unlock(&mtx); +} + +void +flush_and_wait(int fd) +{ + uint8_t msg = 42; + + /* + * Lock taken ahead of waking flush_cb so this thread doesn't race + * with the event thread. + */ + pthread_mutex_lock(&mtx); + if (write(fd, &msg, sizeof (msg)) != sizeof (msg)) { + FAIL(("bad write")); + } + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int unused_pipe[2]; + int flush_pipe[2]; + struct mevent *unused_evp, *flush_evp; + int count1, count2; + + start_test(argv[0], 5); + start_event_thread(); + + /* + * Create first pipe and related event + */ + if (pipe(unused_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("unused_pipe[] = { %d, %d }", unused_pipe[0], unused_pipe[1])); + if (fcntl(unused_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + unused_evp = mevent_add(unused_pipe[0], EVF_READ, not_called_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), unused_evp, NULL); + + /* + * Create flush pipe and related event + */ + if (pipe(flush_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("flush_pipe[] = { %d, %d }", flush_pipe[0], + flush_pipe[1])); + if (fcntl(flush_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + flush_evp = mevent_add(flush_pipe[0], EVF_READ, flush_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), flush_evp, NULL); + + /* Get count before delete. */ + flush_and_wait(flush_pipe[1]); + count1 = get_count(); + + /* + * Delete the first event and flush a read after the delete is + * complete. + */ + if (mevent_delete(unused_evp) != 0) { + FAIL_ERRNO("mevent_delete"); + } + + /* + * Verify count decreased. + */ + flush_and_wait(flush_pipe[1]); + count2 = get_count(); + if (count1 - 1 != count2 ) { + FAIL(("mevent_delete() did not decrease count by 1: " + "was %d, now %d", count1, count2)); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/mevent.c b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c new file mode 100644 index 0000000000..17b6546847 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include "../../../mevent.c" +#include "testlib.h" + +/* + * Returns by reference the number of events on the global and change lists. + * + * Used by tests that wish to ensure that the event count changes as suggested + * by mevent_add() and mevent_delete(). Note that a delete does not immediately + * delete an event. Events that are pending delete are included in the change + * list until the next pass through the change list to process pending changes. + */ +void +test_mevent_count_lists(int *ret_global, int *ret_change, int *ret_del_pending) +{ + struct mevent *mevp; + int global = 0; + int change = 0; + int del_pending = 0; + + mevent_qlock(); + + LIST_FOREACH(mevp, &global_head, me_list) { + global++; + VERBOSE(("on global: type %d fd %d state %d", mevp->me_type, + mevp->me_fd, mevp->me_state)); + } + + LIST_FOREACH(mevp, &change_head, me_list) { + change++; + if (mevp->me_state == MEV_DEL_PENDING) { + del_pending++; + } + VERBOSE(("on change: type %d fd %d state %d", mevp->me_type, + mevp->me_fd, mevp->me_state)); + } + + mevent_qunlock(); + + *ret_global = global; + *ret_change = change; + *ret_del_pending = del_pending; +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c new file mode 100644 index 0000000000..d23b1af96c --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c @@ -0,0 +1,163 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.cancel + * Assertion: A read is not requeued if mevent_disable() is called while it is + * being handled. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. Write to the pipe then wait for a wakeup. + * 4. From the read event callback, disable the event then awaken + * the main thread. + * 5. In the main thread, add a timer event that will awaken the + * main thread after a short delay. + * 5. Write to the pipe and wait to be awoken. The wakeup should + * come from the timer event, not the read event. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +typedef enum { + CB_NONE, + CB_READ, + CB_TIMER, +} lastwake_t; + +static lastwake_t lastwake = CB_NONE; + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static struct mevent *read_event; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + ssize_t nbytes; + char buf[32] = { 0 }; + int err; + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + FAIL_ERRNO("bad read"); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + err = mevent_disable(read_event); + ASSERT_INT_EQ(("mevent_disable: ", strerror(err)), err, 0); + + pthread_mutex_lock(&mtx); + + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_NONE); + lastwake = CB_READ; + + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + + pthread_mutex_unlock(&mtx); +} + +static void +tick(int ms, enum ev_type ev, void *arg) +{ + pthread_mutex_lock(&mtx); + + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ); + lastwake = CB_TIMER; + + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *timer; + ssize_t written; + char *msgs[] = { "first", "second" }; + char *msg; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + /* + * First write + */ + msg = msgs[0]; + read_event = mevent_add(pipefds[0], EVF_READ, munch, msg); + ASSERT_PTR_NEQ(("mevent_add pipefd"), read_event, NULL); + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], msg, strlen(msg)); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg)); + + /* + * Wait for it to be read + */ + pthread_cond_wait(&cv, &mtx); + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ); + pthread_mutex_unlock(&mtx); + + /* + * Add timer, second write. + */ + msg = msgs[1]; + timer = mevent_add(50, EVF_TIMER, tick, msg); + ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL); + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], msg, strlen(msg)); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg)); + + /* + * Wait for timer to expire + */ + pthread_cond_wait(&cv, &mtx); + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_TIMER); + pthread_mutex_unlock(&mtx); + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c new file mode 100644 index 0000000000..c877f014f6 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c @@ -0,0 +1,152 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.pause + * Assertion: mevent_disable() can be used to pause reads. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char cookie[] = "Chocolate chip with fudge stripes"; + +/* + * After this many bytes are sent, writes will get batched up, progress will be + * made on the write side via an interval timer + */ +const int pauseat = 8; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[sizeof (cookie)] = { 0 }; + ssize_t nbytes; + ssize_t expected; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + /* + * For the first while, expect data to come a byte at a time. After the + * pause, we should get a burst with the rest of the data. + */ + if (i > pauseat) { + expected = strlen(cookie) - pauseat - 1; + } else { + expected = 1; + } + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + FAIL_ERRNO("bad read"); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, expected); + + if (expected == 1) { + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + } else { + ASSERT_STR_EQ(("bad last half of cookie"), buf, &cookie[i]); + } + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +static void +tick(int ms, enum ev_type ev, void *arg) +{ + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp, *timer; + ssize_t written; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add pipefd"), evp, NULL); + + for (int i = 0; cookie[i] != 0; i++) { + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + if (i == pauseat) { + timer = mevent_add(10, EVF_TIMER, tick, + &cookie[pauseat]); + ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL); + VERBOSE(("disable munch")); + mevent_disable(evp); + } + } + + pthread_mutex_lock(&mtx); + + mevent_enable(evp); + + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c new file mode 100644 index 0000000000..ddc3e27235 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c @@ -0,0 +1,108 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.requeue + * Assertion: A sequence of writes turns into a sequence of events. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char *cookie = "Chocolate chip with fudge stripes"; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[8] = { 0 }; + ssize_t nbytes; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + ASSERT_INT64_EQ(("bad read: %s", strerror(errno)), nbytes, 1); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, 1); + + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add"), evp, NULL); + + for (int i = 0; cookie[i] != '\0'; i++) { + ssize_t written; + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.c b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c new file mode 100644 index 0000000000..af756d1509 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c @@ -0,0 +1,69 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <pthread.h> +#include <signal.h> +#include <strings.h> +#include <unistd.h> + +#include "testlib.h" +#include "mevent.h" + +const char *testlib_prog; +boolean_t testlib_verbose; + +static void +timed_out(int signo) { + ASSERT_INT_EQ(("timeout signal"), signo, SIGALRM); + + FAIL(("Timed out")); +} + +void +start_test(const char *argv0, uint32_t timeout) +{ + char *val; + + testlib_prog = strrchr(argv0, '/'); + if (testlib_prog == NULL) { + testlib_prog = argv0; + } else { + testlib_prog++; + } + + testlib_verbose = ((val = getenv("TEST_VERBOSE")) != NULL) && + val[0] != '\0'; + + signal(SIGALRM, timed_out); + alarm(timeout); +} + +/* ARGSUSED */ +static void * +event_thread(void *arg) +{ + mevent_dispatch(); + return (NULL); +} + +void +start_event_thread(void) +{ + pthread_t tid; + + if (pthread_create(&tid, NULL, event_thread, NULL) != 0) { + FAIL_ERRNO("pthread_create"); + } +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.h b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h new file mode 100644 index 0000000000..80949f3cc7 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h @@ -0,0 +1,88 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "mevent.h" + +#define EXIT_PASS 0 +#define EXIT_FAIL 1 + +#define VERBOSE(msg) \ + if (testlib_verbose) { \ + (void) printf("VERBOSE %s: %s:%d %s: ", testlib_prog, \ + __FILE__, __LINE__, __func__); \ + (void) printf msg; \ + (void) printf("\n"); \ + } + +#define FAIL_PROLOGUE() \ + (void) printf("FAIL %s: %s:%d: ", testlib_prog, __FILE__, __LINE__) + +#define FAIL(msg) \ + { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf("\n"); \ + exit(EXIT_FAIL); \ + } + +#define FAIL_ERRNO(msg) FAIL((msg ": %s", strerror(errno))) + +#define PASS() \ + { \ + (void) printf("PASS %s\n", testlib_prog); \ + exit(EXIT_PASS); \ + } + +#define ASSERT_CMP(msg, got, cmp, exp, nfmt) \ + if (!(got cmp exp)) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s=" nfmt " %s %s=" nfmt "\n", \ + #got, got, #cmp, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +#define ASSERT_CHAR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%c") +#define ASSERT_INT_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%d") +#define ASSERT_INT_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%d") +#define ASSERT_INT64_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%ld") +#define ASSERT_PTR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%p") +#define ASSERT_PTR_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%p") + +#define ASSERT_STR_EQ(msg, got, exp) \ + if (strcmp(got, exp) != 0) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s='%s' != %s='%s'\n", \ + #got, got, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +extern const char *testlib_prog; +extern boolean_t testlib_verbose; + +extern void start_test(const char *, uint32_t); +extern void start_event_thread(void); +extern void test_mevent_count_lists(int *, int *, int *); diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c new file mode 100644 index 0000000000..1027d0b0f6 --- /dev/null +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -0,0 +1,955 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <dev/ic/ns16550.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#include <capsicum_helpers.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <termios.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <pthread.h> +#include <sysexits.h> +#ifndef __FreeBSD__ +#include <sys/socket.h> +#endif + +#include "mevent.h" +#include "uart_emul.h" + +#define COM1_BASE 0x3F8 +#define COM1_IRQ 4 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 + +#define DEFAULT_RCLK 1843200 +#define DEFAULT_BAUD 9600 + +#define FCR_RX_MASK 0xC0 + +#define MCR_OUT1 0x04 +#define MCR_OUT2 0x08 + +#define MSR_DELTA_MASK 0x0f + +#ifndef REG_SCR +#define REG_SCR com_scr +#endif + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +static struct termios tio_stdio_orig; + +static struct { + int baseaddr; + int irq; + bool inuse; +} uart_lres[] = { + { COM1_BASE, COM1_IRQ, false}, + { COM2_BASE, COM2_IRQ, false}, +}; + +#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct ttyfd { + bool opened; + int fd; /* tty device file descriptor */ + struct termios tio_orig, tio_new; /* I/O Terminals */ +}; + +struct uart_softc { + pthread_mutex_t mtx; /* protects all softc elements */ + uint8_t data; /* Data register (R/W) */ + uint8_t ier; /* Interrupt enable register (R/W) */ + uint8_t lcr; /* Line control register (R/W) */ + uint8_t mcr; /* Modem control register (R/W) */ + uint8_t lsr; /* Line status register (R/W) */ + uint8_t msr; /* Modem status register (R/W) */ + uint8_t fcr; /* FIFO control register (W) */ + uint8_t scr; /* Scratch register (R/W) */ + + uint8_t dll; /* Baudrate divisor latch LSB */ + uint8_t dlh; /* Baudrate divisor latch MSB */ + + struct fifo rxfifo; + struct mevent *mev; + + struct ttyfd tty; +#ifndef __FreeBSD__ + bool sock; + struct { + int clifd; /* console client unix domain socket */ + int servfd; /* console server unix domain socket */ + struct mevent *servmev; /* mevent for server socket */ + } usc_sock; +#endif + + bool thre_int_pending; /* THRE interrupt pending */ + + void *arg; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; + +static void uart_drain(int fd, enum ev_type ev, void *arg); + +static void +ttyclose(void) +{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); +} + +static void +ttyopen(struct ttyfd *tf) +{ + + tcgetattr(tf->fd, &tf->tio_orig); + + tf->tio_new = tf->tio_orig; + cfmakeraw(&tf->tio_new); + tf->tio_new.c_cflag |= CLOCAL; + tcsetattr(tf->fd, TCSANOW, &tf->tio_new); + + if (tf->fd == STDIN_FILENO) { + tio_stdio_orig = tf->tio_orig; + atexit(ttyclose); + } +} + +static int +ttyread(struct ttyfd *tf) +{ + unsigned char rb; + + if (read(tf->fd, &rb, 1) == 1) + return (rb); + else + return (-1); +} + +static void +ttywrite(struct ttyfd *tf, unsigned char wb) +{ + + (void)write(tf->fd, &wb, 1); +} + +#ifndef __FreeBSD__ +static void +sockwrite(struct uart_softc *sc, unsigned char wb) +{ + (void) write(sc->usc_sock.clifd, &wb, 1); +} +#endif + +static void +rxfifo_reset(struct uart_softc *sc, int size) +{ + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* Flush any unread input from the socket buffer. */ + do { + nread = read(sc->usc_sock.clifd, flushbuf, + sizeof (flushbuf)); + } while (nread == sizeof (flushbuf)); + + /* Enable mevent to trigger when new data available on sock */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); +} + +static int +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) +{ + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; + + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = ch; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } + return (0); + } else + return (-1); +} + +static int +rxfifo_getchar(struct uart_softc *sc) +{ + struct fifo *fifo; + int c, error, wasfull; + + wasfull = 0; + fifo = &sc->rxfifo; + if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; + c = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } + return (c); + } else + return (-1); +} + +static int +rxfifo_numchars(struct uart_softc *sc) +{ + struct fifo *fifo = &sc->rxfifo; + + return (fifo->num); +} + +static void +uart_opentty(struct uart_softc *sc) +{ + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); +} + +static uint8_t +modem_status(uint8_t mcr) +{ + uint8_t msr; + + if (mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the MCR are + * reflected back into MSR. + */ + msr = 0; + if (mcr & MCR_RTS) + msr |= MSR_CTS; + if (mcr & MCR_DTR) + msr |= MSR_DSR; + if (mcr & MCR_OUT1) + msr |= MSR_RI; + if (mcr & MCR_OUT2) + msr |= MSR_DCD; + } else { + /* + * Always assert DCD and DSR so tty open doesn't block + * even if CLOCAL is turned off. + */ + msr = MSR_DCD | MSR_DSR; + } + assert((msr & MSR_DELTA_MASK) == 0); + + return (msr); +} + +/* + * The IIR returns a prioritized interrupt reason: + * - receive data available + * - transmit holding register empty + * - modem status change + * + * Return an interrupt reason if one is available. + */ +static int +uart_intr_reason(struct uart_softc *sc) +{ + + if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) + return (IIR_RLS); + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) + return (IIR_RXTOUT); + else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) + return (IIR_TXRDY); + else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) + return (IIR_MLSC); + else + return (IIR_NOPEND); +} + +static void +uart_reset(struct uart_softc *sc) +{ + uint16_t divisor; + + divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; + sc->dll = divisor; + sc->dlh = divisor >> 16; + sc->msr = modem_status(sc->mcr); + + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ +} + +/* + * Toggle the COM port's intr pin depending on whether or not we have an + * interrupt condition to report to the processor. + */ +static void +uart_toggle_intr(struct uart_softc *sc) +{ + uint8_t intr_reason; + + intr_reason = uart_intr_reason(sc); + + if (intr_reason == IIR_NOPEND) + (*sc->intr_deassert)(sc->arg); + else + (*sc->intr_assert)(sc->arg); +} + +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int ch; + + sc = arg; + + assert(fd == sc->tty.fd); + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(&sc->tty); + } else { + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} + +void +uart_write(struct uart_softc *sc, int offset, uint8_t value) +{ + int fifosz; + uint8_t msr; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + sc->dll = value; + goto done; + } + + if (offset == REG_DLH) { + sc->dlh = value; + goto done; + } + } + + switch (offset) { + case REG_DATA: + if (sc->mcr & MCR_LOOPBACK) { + if (rxfifo_putchar(sc, value) != 0) + sc->lsr |= LSR_OE; + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); +#ifndef __FreeBSD__ + } else if (sc->sock) { + sockwrite(sc, value); +#endif + } /* else drop on floor */ + sc->thre_int_pending = true; + break; + case REG_IER: +#ifndef __FreeBSD__ + /* + * Assert an interrupt if re-enabling the THRE intr, since we + * always report THRE as active in the status register. + */ + if ((sc->ier & IER_ETXRDY) == 0 && + (value & IER_ETXRDY) != 0) { + sc->thre_int_pending = true; + } +#endif + /* + * Apply mask so that bits 4-7 are 0 + * Also enables bits 0-3 only if they're 1 + */ + sc->ier = value & 0x0F; + break; + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } + + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. + */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + msr = modem_status(sc->mcr); + + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. + */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint8_t +uart_read(struct uart_softc *sc, int offset) +{ + uint8_t iir, intr_reason, reg; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + reg = sc->dll; + goto done; + } + + if (offset == REG_DLH) { + reg = sc->dlh; + goto done; + } + } + + switch (offset) { + case REG_DATA: + reg = rxfifo_getchar(sc); + break; + case REG_IER: + reg = sc->ier; + break; + case REG_IIR: + iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; + + intr_reason = uart_intr_reason(sc); + + /* + * Deal with side effects of reading the IIR register + */ + if (intr_reason == IIR_TXRDY) + sc->thre_int_pending = false; + + iir |= intr_reason; + + reg = iir; + break; + case REG_LCR: + reg = sc->lcr; + break; + case REG_MCR: + reg = sc->mcr; + break; + case REG_LSR: + /* Transmitter is always ready for more data */ + sc->lsr |= LSR_TEMT | LSR_THRE; + + /* Check for new receive data */ + if (rxfifo_numchars(sc) > 0) + sc->lsr |= LSR_RXRDY; + else + sc->lsr &= ~LSR_RXRDY; + + reg = sc->lsr; + + /* The LSR_OE bit is cleared on LSR read */ + sc->lsr &= ~LSR_OE; + break; + case REG_MSR: + /* + * MSR delta bits are cleared on read + */ + reg = sc->msr; + sc->msr &= ~MSR_DELTA_MASK; + break; + case REG_SCR: + reg = sc->scr; + break; + default: + reg = 0xFF; + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +#ifndef __FreeBSD__ +static void +uart_sock_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc = arg; + char ch; + + /* + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_sock.clifd, &ch, 1); + } else { + bool err_close = false; + + while (rxfifo_available(sc)) { + int res; + + res = read(sc->usc_sock.clifd, &ch, 1); + if (res == 0) { + err_close = true; + break; + } else if (res == -1) { + if (errno != EAGAIN && errno != EINTR) { + err_close = true; + } + break; + } + + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); + + if (err_close) { + (void) fprintf(stderr, "uart: closing client conn\n"); + (void) shutdown(sc->usc_sock.clifd, SHUT_RDWR); + mevent_delete_close(sc->mev); + sc->mev = NULL; + sc->usc_sock.clifd = -1; + } + } + + pthread_mutex_unlock(&sc->mtx); +} + +static void +uart_sock_accept(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc = arg; + int connfd; + + connfd = accept(sc->usc_sock.servfd, NULL, NULL); + if (connfd == -1) { + return; + } + + /* + * Do client connection management under protection of the softc lock + * to avoid racing with concurrent UART events. + */ + pthread_mutex_lock(&sc->mtx); + + if (sc->usc_sock.clifd != -1) { + /* we're already handling a client */ + (void) fprintf(stderr, "uart: unexpected client conn\n"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + perror("uart: fcntl(O_NONBLOCK)"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + sc->usc_sock.clifd = connfd; + sc->mev = mevent_add(sc->usc_sock.clifd, EVF_READ, + uart_sock_drain, sc); + } + } + + pthread_mutex_unlock(&sc->mtx); +} + +static int +init_sock(const char *path) +{ + int servfd; + struct sockaddr_un servaddr; + + bzero(&servaddr, sizeof (servaddr)); + servaddr.sun_family = AF_UNIX; + + if (strlcpy(servaddr.sun_path, path, sizeof (servaddr.sun_path)) >= + sizeof (servaddr.sun_path)) { + (void) fprintf(stderr, "uart: path '%s' too long\n", + path); + return (-1); + } + + if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + (void) fprintf(stderr, "uart: socket() error - %s\n", + strerror(errno)); + return (-1); + } + (void) unlink(servaddr.sun_path); + + if (bind(servfd, (struct sockaddr *)&servaddr, + sizeof (servaddr)) == -1) { + (void) fprintf(stderr, "uart: bind() error - %s\n", + strerror(errno)); + goto out; + } + + if (listen(servfd, 1) == -1) { + (void) fprintf(stderr, "uart: listen() error - %s\n", + strerror(errno)); + goto out; + } + return (servfd); + +out: + (void) unlink(servaddr.sun_path); + (void) close(servfd); + return (-1); +} +#endif /* not __FreeBSD__ */ + +int +uart_legacy_alloc(int which, int *baseaddr, int *irq) +{ + + if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) + return (-1); + + uart_lres[which].inuse = true; + *baseaddr = uart_lres[which].baseaddr; + *irq = uart_lres[which].irq; + + return (0); +} + +struct uart_softc * +uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, + void *arg) +{ + struct uart_softc *sc; + + sc = calloc(1, sizeof(struct uart_softc)); + + sc->arg = arg; + sc->intr_assert = intr_assert; + sc->intr_deassert = intr_deassert; + + pthread_mutex_init(&sc->mtx, NULL); + + uart_reset(sc); + + return (sc); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ + int fd; + int retval; + + retval = -1; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd > 0 && isatty(fd)) { + sc->tty.fd = fd; + sc->tty.opened = true; + retval = 0; + } + + return (retval); +} + +#ifndef __FreeBSD__ +static int +uart_sock_backend(struct uart_softc *sc, const char *inopts) +{ + char *opts; + char *opt; + char *nextopt; + char *path = NULL; + + if (strncmp(inopts, "socket,", 7) != 0) { + return (-1); + } + if ((opts = strdup(inopts + 7)) == NULL) { + return (-1); + } + + nextopt = opts; + for (opt = strsep(&nextopt, ","); opt != NULL; + opt = strsep(&nextopt, ",")) { + if (path == NULL && *opt == '/') { + path = opt; + continue; + } + /* + * XXX check for server and client options here. For now, + * everything is a server + */ + free(opts); + return (-1); + } + + sc->usc_sock.clifd = -1; + if ((sc->usc_sock.servfd = init_sock(path)) == -1) { + free(opts); + return (-1); + } + sc->sock = true; + sc->tty.fd = -1; + sc->usc_sock.servmev = mevent_add(sc->usc_sock.servfd, EVF_READ, + uart_sock_accept, sc); + assert(sc->usc_sock.servmev != NULL); + + return (0); +} +#endif /* not __FreeBSD__ */ + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif + + retval = -1; + + if (opts == NULL) + return (0); + + if (strcmp("stdio", opts) == 0) { + if (!uart_stdio) { + sc->tty.fd = STDIN_FILENO; + sc->tty.opened = true; + uart_stdio = true; + retval = 0; + } +#ifndef __FreeBSD__ + } else if (strncmp("socket,", opts, 7) == 0) { + return (uart_sock_backend(sc, opts)); +#endif + } else if (uart_tty_backend(sc, opts) == 0) { + retval = 0; + } + + /* Make the backend file descriptor non-blocking */ + if (retval == 0 && sc->tty.fd != -1) + retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK); + + if (retval == 0) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, + CAP_WRITE); + if (caph_rights_limit(sc->tty.fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(sc->tty.fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (!uart_stdio) { + if (caph_limit_stdin() == -1) + errx(EX_OSERR, + "Unable to apply rights for sandbox"); + } +#endif + uart_opentty(sc); + } + + return (retval); +} diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h new file mode 100644 index 0000000000..a87202df1f --- /dev/null +++ b/usr/src/cmd/bhyve/uart_emul.h @@ -0,0 +1,47 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _UART_EMUL_H_ +#define _UART_EMUL_H_ + + +#define UART_IO_BAR_SIZE 8 + +struct uart_softc; + +typedef void (*uart_intr_func_t)(void *arg); +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint8_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint8_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); +#endif diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c new file mode 100644 index 0000000000..6ecdd9530e --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.c @@ -0,0 +1,78 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/queue.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> + +#include "usb_emul.h" + +SET_DECLARE(usb_emu_set, struct usb_devemu); + +struct usb_devemu * +usb_emu_finddev(char *name) +{ + struct usb_devemu **udpp, *udp; + + SET_FOREACH(udpp, usb_emu_set) { + udp = *udpp; + if (!strcmp(udp->ue_emu, name)) + return (udp); + } + + return (NULL); +} + +struct usb_data_xfer_block * +usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, + void *hci_data, int ccs) +{ + struct usb_data_xfer_block *xb; + + if (xfer->ndata >= USB_MAX_XFER_BLOCKS) + return (NULL); + + xb = &xfer->data[xfer->tail]; + xb->buf = buf; + xb->blen = blen; + xb->hci_data = hci_data; + xb->ccs = ccs; + xb->processed = 0; + xb->bdone = 0; + xfer->ndata++; + xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS; + return (xb); +} diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h new file mode 100644 index 0000000000..e55a421b6f --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.h @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _USB_EMUL_H_ +#define _USB_EMUL_H_ + +#include <stdlib.h> +#include <sys/linker_set.h> +#include <pthread.h> +#ifndef __FreeBSD__ +#include <synch.h> +#endif + +#define USB_MAX_XFER_BLOCKS 8 + +#define USB_XFER_OUT 0 +#define USB_XFER_IN 1 + + + +struct usb_hci; +struct usb_device_request; +struct usb_data_xfer; + +/* Device emulation handlers */ +struct usb_devemu { + char *ue_emu; /* name of device emulation */ + int ue_usbver; /* usb version: 2 or 3 */ + int ue_usbspeed; /* usb device speed */ + + /* instance creation */ + void *(*ue_init)(struct usb_hci *hci, char *opt); + + /* handlers */ + int (*ue_request)(void *sc, struct usb_data_xfer *xfer); + int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir, + int epctx); + int (*ue_reset)(void *sc); + int (*ue_remove)(void *sc); + int (*ue_stop)(void *sc); +}; +#define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); + +/* + * USB device events to notify HCI when state changes + */ +enum hci_usbev { + USBDEV_ATTACH, + USBDEV_RESET, + USBDEV_STOP, + USBDEV_REMOVE, +}; + +/* usb controller, ie xhci, ehci */ +struct usb_hci { + int (*hci_intr)(struct usb_hci *hci, int epctx); + int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid, + void *param); + void *hci_sc; /* private softc for hci */ + + /* controller managed fields */ + int hci_address; + int hci_port; +}; + +/* + * Each xfer block is mapped to the hci transfer block. + * On input into the device handler, blen is set to the lenght of buf. + * The device handler is to update blen to reflect on the residual size + * of the buffer, i.e. len(buf) - len(consumed). + */ +struct usb_data_xfer_block { + void *buf; /* IN or OUT pointer */ + int blen; /* in:len(buf), out:len(remaining) */ + int bdone; /* bytes transferred */ + uint32_t processed; /* device processed this + errcode */ + void *hci_data; /* HCI private reference */ + int ccs; + uint32_t streamid; + uint64_t trbnext; /* next TRB guest address */ +}; + +struct usb_data_xfer { + struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; + struct usb_device_request *ureq; /* setup ctl request */ + int ndata; /* # of data items */ + int head; + int tail; + pthread_mutex_t mtx; +}; + +enum USB_ERRCODE { + USB_ACK, + USB_NAK, + USB_STALL, + USB_NYET, + USB_ERR, + USB_SHORT +}; + +#define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 +#define USB_DATA_SET_ERRCODE(x,e) do { \ + (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ + } while (0) + +#define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) + +#define USB_DATA_XFER_INIT(x) do { \ + memset((x), 0, sizeof(*(x))); \ + pthread_mutex_init(&((x)->mtx), NULL); \ + } while (0) + +#define USB_DATA_XFER_RESET(x) do { \ + memset((x)->data, 0, sizeof((x)->data)); \ + (x)->ndata = 0; \ + (x)->head = (x)->tail = 0; \ + } while (0) + +#define USB_DATA_XFER_LOCK(x) do { \ + pthread_mutex_lock(&((x)->mtx)); \ + } while (0) + +#define USB_DATA_XFER_UNLOCK(x) do { \ + pthread_mutex_unlock(&((x)->mtx)); \ + } while (0) +#ifndef __FreeBSD__ +#define USB_DATA_XFER_LOCK_HELD(x) MUTEX_HELD(&((x)->mtx)) +#endif + +struct usb_devemu *usb_emu_finddev(char *name); + +struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, + void *buf, int blen, void *hci_data, int ccs); + + +#endif /* _USB_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c new file mode 100644 index 0000000000..e613012071 --- /dev/null +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -0,0 +1,802 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/time.h> + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <dev/usb/usb.h> +#include <dev/usb/usbdi.h> + +#include "usb_emul.h" +#include "console.h" +#include "bhyvegc.h" + +static int umouse_debug = 0; +#define DPRINTF(params) if (umouse_debug) printf params +#define WPRINTF(params) printf params + +/* USB endpoint context (1-15) for reporting mouse data events*/ +#define UMOUSE_INTR_ENDPT 1 + +#define UMOUSE_REPORT_DESC_TYPE 0x22 + +#define UMOUSE_GET_REPORT 0x01 +#define UMOUSE_GET_IDLE 0x02 +#define UMOUSE_GET_PROTOCOL 0x03 +#define UMOUSE_SET_REPORT 0x09 +#define UMOUSE_SET_IDLE 0x0A +#define UMOUSE_SET_PROTOCOL 0x0B + +#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +enum { + UMSTR_LANG, + UMSTR_MANUFACTURER, + UMSTR_PRODUCT, + UMSTR_SERIAL, + UMSTR_CONFIG, + UMSTR_MAX +}; + +static const char *umouse_desc_strings[] = { + "\x04\x09", + "BHYVE", + "HID Tablet", + "01", + "HID Tablet Device", +}; + +struct umouse_hid_descriptor { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bcdHID[2]; + uint8_t bCountryCode; + uint8_t bNumDescriptors; + uint8_t bReportDescriptorType; + uint8_t wItemLength[2]; +} __packed; + +struct umouse_config_desc { + struct usb_config_descriptor confd; + struct usb_interface_descriptor ifcd; + struct umouse_hid_descriptor hidd; + struct usb_endpoint_descriptor endpd; + struct usb_endpoint_ss_comp_descriptor sscompd; +} __packed; + +#define MOUSE_MAX_X 0x8000 +#define MOUSE_MAX_Y 0x8000 + +static const uint8_t umouse_report_desc[] = { + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x02, /* USAGE (Mouse) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* USAGE (Pointer) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x05, 0x09, /* USAGE_PAGE (Button) */ + 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x05, 0x01, /* USAGE Page (Generic Desktop) */ + 0x09, 0x38, /* USAGE (Wheel) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + 0xc0, /* END_COLLECTION */ + 0xc0 /* END_COLLECTION */ +}; + +struct umouse_report { + uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ + int16_t x; /* x position */ + int16_t y; /* y position */ + int8_t z; /* z wheel position */ +} __packed; + + +#define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +static struct usb_device_descriptor umouse_dev_desc = { + .bLength = sizeof(umouse_dev_desc), + .bDescriptorType = UDESC_DEVICE, + MSETW(.bcdUSB, UD_USB_3_0), + .bMaxPacketSize = 8, /* max packet size */ + MSETW(.idVendor, 0xFB5D), /* vendor */ + MSETW(.idProduct, 0x0001), /* product */ + MSETW(.bcdDevice, 0), /* device version */ + .iManufacturer = UMSTR_MANUFACTURER, + .iProduct = UMSTR_PRODUCT, + .iSerialNumber = UMSTR_SERIAL, + .bNumConfigurations = 1, +}; + +static struct umouse_config_desc umouse_confd = { + .confd = { + .bLength = sizeof(umouse_confd.confd), + .bDescriptorType = UDESC_CONFIG, + .wTotalLength[0] = sizeof(umouse_confd), + .bNumInterface = 1, + .bConfigurationValue = 1, + .iConfiguration = UMSTR_CONFIG, + .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, + .bMaxPower = 0, + }, + .ifcd = { + .bLength = sizeof(umouse_confd.ifcd), + .bDescriptorType = UDESC_INTERFACE, + .bNumEndpoints = 1, + .bInterfaceClass = UICLASS_HID, + .bInterfaceSubClass = UISUBCLASS_BOOT, + .bInterfaceProtocol = UIPROTO_MOUSE, + }, + .hidd = { + .bLength = sizeof(umouse_confd.hidd), + .bDescriptorType = 0x21, + .bcdHID = { 0x01, 0x10 }, + .bCountryCode = 0, + .bNumDescriptors = 1, + .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, + .wItemLength = { sizeof(umouse_report_desc), 0 }, + }, + .endpd = { + .bLength = sizeof(umouse_confd.endpd), + .bDescriptorType = UDESC_ENDPOINT, + .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, + .bmAttributes = UE_INTERRUPT, + .wMaxPacketSize[0] = 8, + .bInterval = 0xA, + }, + .sscompd = { + .bLength = sizeof(umouse_confd.sscompd), + .bDescriptorType = UDESC_ENDPOINT_SS_COMP, + .bMaxBurst = 0, + .bmAttributes = 0, + MSETW(.wBytesPerInterval, 0), + }, +}; + + +struct umouse_bos_desc { + struct usb_bos_descriptor bosd; + struct usb_devcap_ss_descriptor usbssd; +} __packed; + + +struct umouse_bos_desc umouse_bosd = { + .bosd = { + .bLength = sizeof(umouse_bosd.bosd), + .bDescriptorType = UDESC_BOS, + HSETW(.wTotalLength, sizeof(umouse_bosd)), + .bNumDeviceCaps = 1, + }, + .usbssd = { + .bLength = sizeof(umouse_bosd.usbssd), + .bDescriptorType = UDESC_DEVICE_CAPABILITY, + .bDevCapabilityType = 3, + .bmAttributes = 0, + HSETW(.wSpeedsSupported, 0x08), + .bFunctionalitySupport = 3, + .bU1DevExitLat = 0xa, /* dummy - not used */ + .wU2DevExitLat = { 0x20, 0x00 }, + } +}; + + +struct umouse_softc { + struct usb_hci *hci; + + char *opt; + + struct umouse_report um_report; + int newdata; + struct { + uint8_t idle; + uint8_t protocol; + uint8_t feature; + } hid; + + pthread_mutex_t mtx; + pthread_mutex_t ev_mtx; + int polling; + struct timeval prev_evt; +}; + +static void +umouse_event(uint8_t button, int x, int y, void *arg) +{ + struct umouse_softc *sc; + struct bhyvegc_image *gc; + + gc = console_get_image(); + if (gc == NULL) { + /* not ready */ + return; + } + + sc = arg; + + pthread_mutex_lock(&sc->mtx); + + sc->um_report.buttons = 0; + sc->um_report.z = 0; + + if (button & 0x01) + sc->um_report.buttons |= 0x01; /* left */ + if (button & 0x02) + sc->um_report.buttons |= 0x04; /* middle */ + if (button & 0x04) + sc->um_report.buttons |= 0x02; /* right */ + if (button & 0x8) + sc->um_report.z = 1; + if (button & 0x10) + sc->um_report.z = -1; + + /* scale coords to mouse resolution */ + sc->um_report.x = MOUSE_MAX_X * x / gc->width; + sc->um_report.y = MOUSE_MAX_Y * y / gc->height; + sc->newdata = 1; + pthread_mutex_unlock(&sc->mtx); + + pthread_mutex_lock(&sc->ev_mtx); + sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); + pthread_mutex_unlock(&sc->ev_mtx); +} + +static void * +umouse_init(struct usb_hci *hci, char *opt) +{ + struct umouse_softc *sc; + + sc = calloc(1, sizeof(struct umouse_softc)); + sc->hci = hci; + + sc->hid.protocol = 1; /* REPORT protocol */ + sc->opt = strdup(opt); + pthread_mutex_init(&sc->mtx, NULL); + pthread_mutex_init(&sc->ev_mtx, NULL); + + console_ptr_register(umouse_event, sc, 10); + + return (sc); +} + +#define UREQ(x,y) ((x) | ((y) << 8)) + +static int +umouse_request(void *scarg, struct usb_data_xfer *xfer) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + const char *str; + uint16_t value; + uint16_t index; + uint16_t len; + uint16_t slen; + uint8_t *udata; + int err; + int i, idx; + int eshort; + + sc = scarg; + + data = NULL; + udata = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + xfer->data[idx].bdone = 0; + if (data == NULL && USB_DATA_OK(xfer,i)) { + data = &xfer->data[idx]; + udata = data->buf; + } + + xfer->data[idx].processed = 1; + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + + err = USB_ERR_NORMAL_COMPLETION; + eshort = 0; + + if (!xfer->ureq) { + DPRINTF(("umouse_request: port %d\r\n", sc->hci->hci_port)); + goto done; + } + + value = UGETW(xfer->ureq->wValue); + index = UGETW(xfer->ureq->wIndex); + len = UGETW(xfer->ureq->wLength); + + DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " + "idx 0x%x, len %u\r\n", + sc->hci->hci_port, xfer->ureq->bmRequestType, + xfer->ureq->bRequest, value, index, len)); + + switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { + case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)\r\n")); + if (!data) + break; + + *udata = umouse_confd.confd.bConfigurationValue; + data->blen = len > 0 ? len - 1 : 0; + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x\r\n", + value >> 8)); + if (!data) + break; + + switch (value >> 8) { + case UDESC_DEVICE: + DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " + "sizeof(umouse_dev_desc) %lu\r\n", + len, sizeof(umouse_dev_desc))); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_dev_desc)) { + data->blen = len - sizeof(umouse_dev_desc); + len = sizeof(umouse_dev_desc); + } else + data->blen = 0; + memcpy(data->buf, &umouse_dev_desc, len); + data->bdone += len; + break; + + case UDESC_CONFIG: + DPRINTF(("umouse: (->UDESC_CONFIG)\r\n")); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_confd)) { + data->blen = len - sizeof(umouse_confd); + len = sizeof(umouse_confd); + } else + data->blen = 0; + + memcpy(data->buf, &umouse_confd, len); + data->bdone += len; + break; + + case UDESC_STRING: + DPRINTF(("umouse: (->UDESC_STRING)\r\n")); + str = NULL; + if ((value & 0xFF) < UMSTR_MAX) + str = umouse_desc_strings[value & 0xFF]; + else + goto done; + + if ((value & 0xFF) == UMSTR_LANG) { + udata[0] = 4; + udata[1] = UDESC_STRING; + data->blen = len - 2; + len -= 2; + data->bdone += 2; + + if (len >= 2) { + udata[2] = str[0]; + udata[3] = str[1]; + data->blen -= 2; + data->bdone += 2; + } else + data->blen = 0; + + goto done; + } + + slen = 2 + strlen(str) * 2; + udata[0] = slen; + udata[1] = UDESC_STRING; + + if (len > slen) { + data->blen = len - slen; + len = slen; + } else + data->blen = 0; + for (i = 2; i < len; i += 2) { + udata[i] = *str++; + udata[i+1] = '\0'; + } + data->bdone += slen; + + break; + + case UDESC_BOS: + DPRINTF(("umouse: USB3 BOS\r\n")); + if (len > sizeof(umouse_bosd)) { + data->blen = len - sizeof(umouse_bosd); + len = sizeof(umouse_bosd); + } else + data->blen = 0; + memcpy(udata, &umouse_bosd, len); + data->bdone += len; + break; + + default: + DPRINTF(("umouse: unknown(%d)->ERROR\r\n", value >> 8)); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + switch (value >> 8) { + case UMOUSE_REPORT_DESC_TYPE: + if (len > sizeof(umouse_report_desc)) { + data->blen = len - sizeof(umouse_report_desc); + len = sizeof(umouse_report_desc); + } else + data->blen = 0; + memcpy(data->buf, umouse_report_desc, len); + data->bdone += len; + break; + default: + DPRINTF(("umouse: IO ERROR\r\n")); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)\r\n")); + if (index != 0) { + DPRINTF(("umouse get_interface, invalid index %d\r\n", + index)); + err = USB_ERR_IOERROR; + goto done; + } + + if (!data) + break; + + if (len > 0) { + *udata = 0; + data->blen = len - 1; + } + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_STATUS, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)\r\n")); + if (data != NULL && len > 1) { + if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) + USETW(udata, UDS_REMOTE_WAKEUP); + else + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): + case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)\r\n")); + if (data != NULL && len > 1) { + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): + /* XXX Controller should've handled this */ + DPRINTF(("umouse set address %u\r\n", value)); + break; + + case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): + DPRINTF(("umouse set config %u\r\n", value)); + break; + + case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): + DPRINTF(("umouse set descriptor %u\r\n", value)); + break; + + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = 0; + break; + + case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; + break; + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): + case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): + DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)\r\n")); + err = USB_ERR_IOERROR; + goto done; + + case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): + DPRINTF(("umouse set interface %u\r\n", value)); + break; + + case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): + DPRINTF(("umouse set isoch delay %u\r\n", value)); + break; + + case UREQ(UR_SET_SEL, 0): + DPRINTF(("umouse set sel\r\n")); + break; + + case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): + DPRINTF(("umouse synch frame\r\n")); + break; + + /* HID device requests */ + + case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) { + /* TODO read from backend */ + + if (len > sizeof(sc->um_report)) { + data->blen = len - sizeof(sc->um_report); + len = sizeof(sc->um_report); + } else + data->blen = 0; + + memcpy(data->buf, &sc->um_report, len); + data->bdone += len; + } else { + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.idle; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.protocol; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored\r\n")); + break; + + case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): + sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.idle)); + break; + + case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): + sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.protocol)); + break; + + default: + DPRINTF(("**** umouse request unhandled\r\n")); + err = USB_ERR_IOERROR; + break; + } + +done: + if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && + (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) + data->blen = 0; + else if (eshort) + err = USB_ERR_SHORT_XFER; + + DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u\r\n", + err, (data ? data->blen : 0), (data ? data->bdone : 0))); + + return (err); +} + +static int +umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, + int epctx) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + uint8_t *udata; + int len, i, idx; + int err; + + DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d\r\n", + dir ? "IN" : "OUT", epctx, xfer->data[0].blen)); + + + /* find buffer to add data */ + udata = NULL; + err = USB_ERR_NORMAL_COMPLETION; + + /* handle xfer at first unprocessed item with buffer */ + data = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + data = &xfer->data[idx]; + if (data->buf != NULL && data->blen != 0) { + break; + } else { + data->processed = 1; + data = NULL; + } + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + if (!data) + goto done; + + udata = data->buf; + len = data->blen; + + if (udata == NULL) { + DPRINTF(("umouse no buffer provided for input\r\n")); + err = USB_ERR_NOMEM; + goto done; + } + + sc = scarg; + + if (dir) { + + pthread_mutex_lock(&sc->mtx); + + if (!sc->newdata) { + err = USB_ERR_CANCELLED; + USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + + if (sc->polling) { + err = USB_ERR_STALLED; + USB_DATA_SET_ERRCODE(data, USB_STALL); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + sc->polling = 1; + + if (len > 0) { + sc->newdata = 0; + + data->processed = 1; + data->bdone += 6; + memcpy(udata, &sc->um_report, 6); + data->blen = len - 6; + if (data->blen > 0) + err = USB_ERR_SHORT_XFER; + } + + sc->polling = 0; + pthread_mutex_unlock(&sc->mtx); + } else { + USB_DATA_SET_ERRCODE(data, USB_STALL); + err = USB_ERR_STALLED; + } + +done: + return (err); +} + +static int +umouse_reset(void *scarg) +{ + struct umouse_softc *sc; + + sc = scarg; + + sc->newdata = 0; + + return (0); +} + +static int +umouse_remove(void *scarg) +{ + + return (0); +} + +static int +umouse_stop(void *scarg) +{ + + return (0); +} + + +struct usb_devemu ue_mouse = { + .ue_emu = "tablet", + .ue_usbver = 3, + .ue_usbspeed = USB_SPEED_HIGH, + .ue_init = umouse_init, + .ue_request = umouse_request, + .ue_data = umouse_data_handler, + .ue_reset = umouse_reset, + .ue_remove = umouse_remove, + .ue_stop = umouse_stop +}; +USB_EMUL_SET(ue_mouse); diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c new file mode 100644 index 0000000000..314ddeb1e8 --- /dev/null +++ b/usr/src/cmd/bhyve/vga.c @@ -0,0 +1,1357 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <assert.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <machine/vmm.h> + +#include "bhyvegc.h" +#include "console.h" +#include "inout.h" +#include "mem.h" +#include "vga.h" + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +struct vga_softc { + struct mem_range mr; + + struct bhyvegc *gc; + int gc_width; + int gc_height; + struct bhyvegc_image *gc_image; + + uint8_t *vga_ram; + + /* + * General registers + */ + uint8_t vga_misc; + uint8_t vga_sts1; + + /* + * Sequencer + */ + struct { + int seq_index; + uint8_t seq_reset; + uint8_t seq_clock_mode; + int seq_cm_dots; + uint8_t seq_map_mask; + uint8_t seq_cmap_sel; + int seq_cmap_pri_off; + int seq_cmap_sec_off; + uint8_t seq_mm; + } vga_seq; + + /* + * CRT Controller + */ + struct { + int crtc_index; + uint8_t crtc_mode_ctrl; + uint8_t crtc_horiz_total; + uint8_t crtc_horiz_disp_end; + uint8_t crtc_start_horiz_blank; + uint8_t crtc_end_horiz_blank; + uint8_t crtc_start_horiz_retrace; + uint8_t crtc_end_horiz_retrace; + uint8_t crtc_vert_total; + uint8_t crtc_overflow; + uint8_t crtc_present_row_scan; + uint8_t crtc_max_scan_line; + uint8_t crtc_cursor_start; + uint8_t crtc_cursor_on; + uint8_t crtc_cursor_end; + uint8_t crtc_start_addr_high; + uint8_t crtc_start_addr_low; + uint16_t crtc_start_addr; + uint8_t crtc_cursor_loc_low; + uint8_t crtc_cursor_loc_high; + uint16_t crtc_cursor_loc; + uint8_t crtc_vert_retrace_start; + uint8_t crtc_vert_retrace_end; + uint8_t crtc_vert_disp_end; + uint8_t crtc_offset; + uint8_t crtc_underline_loc; + uint8_t crtc_start_vert_blank; + uint8_t crtc_end_vert_blank; + uint8_t crtc_line_compare; + } vga_crtc; + + /* + * Graphics Controller + */ + struct { + int gc_index; + uint8_t gc_set_reset; + uint8_t gc_enb_set_reset; + uint8_t gc_color_compare; + uint8_t gc_rotate; + uint8_t gc_op; + uint8_t gc_read_map_sel; + uint8_t gc_mode; + bool gc_mode_c4; /* chain 4 */ + bool gc_mode_oe; /* odd/even */ + uint8_t gc_mode_rm; /* read mode */ + uint8_t gc_mode_wm; /* write mode */ + uint8_t gc_misc; + uint8_t gc_misc_gm; /* graphics mode */ + uint8_t gc_misc_mm; /* memory map */ + uint8_t gc_color_dont_care; + uint8_t gc_bit_mask; + uint8_t gc_latch0; + uint8_t gc_latch1; + uint8_t gc_latch2; + uint8_t gc_latch3; + } vga_gc; + + /* + * Attribute Controller + */ + struct { + int atc_flipflop; + int atc_index; + uint8_t atc_palette[16]; + uint8_t atc_mode; + uint8_t atc_overscan_color; + uint8_t atc_color_plane_enb; + uint8_t atc_horiz_pixel_panning; + uint8_t atc_color_select; + uint8_t atc_color_select_45; + uint8_t atc_color_select_67; + } vga_atc; + + /* + * DAC + */ + struct { + uint8_t dac_state; + uint8_t dac_rd_index; + uint8_t dac_rd_subindex; + uint8_t dac_wr_index; + uint8_t dac_wr_subindex; + uint8_t dac_palette[3 * 256]; + uint32_t dac_palette_rgb[256]; + } vga_dac; +}; + +static bool +vga_in_reset(struct vga_softc *sc) +{ + return (((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0) || + ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0)); +} + +static void +vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) +{ + int old_width, old_height; + + if (vga_in_reset(sc)) + return; + + //old_width = sc->gc_width; + //old_height = sc->gc_height; + old_width = sc->gc_image->width; + old_height = sc->gc_image->height; + + /* + * Horizontal Display End: For text modes this is the number + * of characters. For graphics modes this is the number of + * pixels per scanlines divided by the number of pixels per + * character clock. + */ + sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) * + sc->vga_seq.seq_cm_dots; + + sc->gc_height = (sc->vga_crtc.crtc_vert_disp_end | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >> CRTC_OF_VDE8_SHIFT) << 8) | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >> CRTC_OF_VDE9_SHIFT) << 9)) + 1; + + if (old_width != sc->gc_width || old_height != sc->gc_height) + bhyvegc_resize(gc, sc->gc_width, sc->gc_height); +} + +static uint32_t +vga_get_pixel(struct vga_softc *sc, int x, int y) +{ + int offset; + int bit; + uint8_t data; + uint8_t idx; + + offset = (y * sc->gc_width / 8) + (x / 8); + bit = 7 - (x % 8); + + data = (((sc->vga_ram[offset + 0 * 64*KB] >> bit) & 0x1) << 0) | + (((sc->vga_ram[offset + 1 * 64*KB] >> bit) & 0x1) << 1) | + (((sc->vga_ram[offset + 2 * 64*KB] >> bit) & 0x1) << 2) | + (((sc->vga_ram[offset + 3 * 64*KB] >> bit) & 0x1) << 3); + + data &= sc->vga_atc.atc_color_plane_enb; + + if (sc->vga_atc.atc_mode & ATC_MC_IPS) { + idx = sc->vga_atc.atc_palette[data] & 0x0f; + idx |= sc->vga_atc.atc_color_select_45; + } else { + idx = sc->vga_atc.atc_palette[data]; + } + idx |= sc->vga_atc.atc_color_select_67; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_graphics(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_pixel(sc, x, y); + } + } +} + +static uint32_t +vga_get_text_pixel(struct vga_softc *sc, int x, int y) +{ + int dots, offset, bit, font_offset; + uint8_t ch, attr, font; + uint8_t idx; + + dots = sc->vga_seq.seq_cm_dots; + + offset = 2 * sc->vga_crtc.crtc_start_addr; + offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; + + bit = 7 - (x % dots > 7 ? 7 : x % dots); + + ch = sc->vga_ram[offset + 0 * 64*KB]; + attr = sc->vga_ram[offset + 1 * 64*KB]; + + if (sc->vga_crtc.crtc_cursor_on && + (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) && + ((y % 16) >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) && + ((y % 16) <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) { + idx = sc->vga_atc.atc_palette[attr & 0xf]; + return (sc->vga_dac.dac_palette_rgb[idx]); + } + + if ((sc->vga_seq.seq_mm & SEQ_MM_EM) && + sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) { + if (attr & 0x8) + font_offset = sc->vga_seq.seq_cmap_pri_off + + (ch << 5) + y % 16; + else + font_offset = sc->vga_seq.seq_cmap_sec_off + + (ch << 5) + y % 16; + attr &= ~0x8; + } else { + font_offset = (ch << 5) + y % 16; + } + + font = sc->vga_ram[font_offset + 2 * 64*KB]; + + if (font & (1 << bit)) + idx = sc->vga_atc.atc_palette[attr & 0xf]; + else + idx = sc->vga_atc.atc_palette[attr >> 4]; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_text(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_text_pixel(sc, x, y); + } + } +} + +void +vga_render(struct bhyvegc *gc, void *arg) +{ + struct vga_softc *sc = arg; + + vga_check_size(gc, sc); + + if (vga_in_reset(sc)) { + memset(sc->gc_image->data, 0, + sc->gc_image->width * sc->gc_image->height * + sizeof (uint32_t)); + return; + } + + if (sc->vga_gc.gc_misc_gm && (sc->vga_atc.atc_mode & ATC_MC_GA)) + vga_render_graphics(sc); + else + vga_render_text(sc); +} + +static uint64_t +vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) +{ + struct vga_softc *sc = arg1; + uint8_t map_sel; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128k + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64k + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ +#ifdef __FreeBSD__ + assert(0); +#else + abort(); +#endif + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + /* Fill latches. */ + sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB]; + sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB]; + sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB]; + sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB]; + + if (sc->vga_gc.gc_mode_rm) { + /* read mode 1 */ + assert(0); + } + + map_sel = sc->vga_gc.gc_read_map_sel; + if (sc->vga_gc.gc_mode_oe) { + map_sel |= (offset & 1); + offset &= ~1; + } + + /* read mode 0: return the byte from the selected plane. */ + offset += map_sel * 64*KB; + + return (sc->vga_ram[offset]); +} + +static void +vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) +{ + struct vga_softc *sc = arg1; + uint8_t c0, c1, c2, c3; + uint8_t m0, m1, m2, m3; + uint8_t set_reset; + uint8_t enb_set_reset; + uint8_t mask; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128kb + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64kb + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ +#ifdef __FreeBSD__ + assert(0); +#else + abort(); +#endif + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + set_reset = sc->vga_gc.gc_set_reset; + enb_set_reset = sc->vga_gc.gc_enb_set_reset; + + c0 = sc->vga_gc.gc_latch0; + c1 = sc->vga_gc.gc_latch1; + c2 = sc->vga_gc.gc_latch2; + c3 = sc->vga_gc.gc_latch3; + + switch (sc->vga_gc.gc_mode_wm) { + case 0: + /* write mode 0 */ + mask = sc->vga_gc.gc_bit_mask; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1) ? mask : 0x00; + m1 = (set_reset & 2) ? mask : 0x00; + m2 = (set_reset & 4) ? mask : 0x00; + m3 = (set_reset & 8) ? mask : 0x00; + + c0 = (enb_set_reset & 1) ? (c0 & ~mask) : (val & mask); + c1 = (enb_set_reset & 2) ? (c1 & ~mask) : (val & mask); + c2 = (enb_set_reset & 4) ? (c2 & ~mask) : (val & mask); + c3 = (enb_set_reset & 8) ? (c3 & ~mask) : (val & mask); + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = set_reset & 1 ? 0xff : ~mask; + m1 = set_reset & 2 ? 0xff : ~mask; + m2 = set_reset & 4 ? 0xff : ~mask; + m3 = set_reset & 8 ? 0xff : ~mask; + + c0 = enb_set_reset & 1 ? c0 & m0 : val & m0; + c1 = enb_set_reset & 2 ? c1 & m1 : val & m1; + c2 = enb_set_reset & 4 ? c2 & m2 : val & m2; + c3 = enb_set_reset & 8 ? c3 & m3 : val & m3; + break; + case 0x10: /* OR */ + m0 = set_reset & 1 ? mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 | m0 : val | m0; + c1 = enb_set_reset & 2 ? c1 | m1 : val | m1; + c2 = enb_set_reset & 4 ? c2 | m2 : val | m2; + c3 = enb_set_reset & 8 ? c3 | m3 : val | m3; + break; + case 0x18: /* XOR */ + m0 = set_reset & 1 ? mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 ^ m0 : val ^ m0; + c1 = enb_set_reset & 2 ? c1 ^ m1 : val ^ m1; + c2 = enb_set_reset & 4 ? c2 ^ m2 : val ^ m2; + c3 = enb_set_reset & 8 ? c3 ^ m3 : val ^ m3; + break; + } + break; + case 1: + /* write mode 1 */ + break; + case 2: + /* write mode 2 */ + mask = sc->vga_gc.gc_bit_mask; + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (val & 1 ? 0xff : 0x00) | ~mask; + m1 = (val & 2 ? 0xff : 0x00) | ~mask; + m2 = (val & 4 ? 0xff : 0x00) | ~mask; + m3 = (val & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + case 3: + /* write mode 3 */ + mask = sc->vga_gc.gc_bit_mask & val; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (set_reset & 1 ? 0xff : 0x00) | ~mask; + m1 = (set_reset & 2 ? 0xff : 0x00) | ~mask; + m2 = (set_reset & 4 ? 0xff : 0x00) | ~mask; + m3 = (set_reset & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + } + + if (sc->vga_gc.gc_mode_oe) { + if (offset & 1) { + offset &= ~1; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + } + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } +} + +static int +vga_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + if (dir == MEM_F_WRITE) { + switch (size) { + case 1: + vga_mem_wr_handler(ctx, addr, *val, arg1); + break; + case 2: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + break; + case 4: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + break; + case 8: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + vga_mem_wr_handler(ctx, addr + 4, *val >> 32, arg1); + vga_mem_wr_handler(ctx, addr + 5, *val >> 40, arg1); + vga_mem_wr_handler(ctx, addr + 6, *val >> 48, arg1); + vga_mem_wr_handler(ctx, addr + 7, *val >> 56, arg1); + break; + } + } else { + switch (size) { + case 1: + *val = vga_mem_rd_handler(ctx, addr, arg1); + break; + case 2: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + break; + case 4: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + break; + case 8: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + *val |= vga_mem_rd_handler(ctx, addr + 4, arg1) << 32; + *val |= vga_mem_rd_handler(ctx, addr + 5, arg1) << 40; + *val |= vga_mem_rd_handler(ctx, addr + 6, arg1) << 48; + *val |= vga_mem_rd_handler(ctx, addr + 7, arg1) << 56; + break; + } + } + + return (0); +} + +static int +vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, + uint8_t *val, void *arg) +{ + struct vga_softc *sc = arg; + + switch (port) { + case CRTC_IDX_MONO_PORT: + case CRTC_IDX_COLOR_PORT: + *val = sc->vga_crtc.crtc_index; + break; + case CRTC_DATA_MONO_PORT: + case CRTC_DATA_COLOR_PORT: + switch (sc->vga_crtc.crtc_index) { + case CRTC_HORIZ_TOTAL: + *val = sc->vga_crtc.crtc_horiz_total; + break; + case CRTC_HORIZ_DISP_END: + *val = sc->vga_crtc.crtc_horiz_disp_end; + break; + case CRTC_START_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_start_horiz_blank; + break; + case CRTC_END_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_end_horiz_blank; + break; + case CRTC_START_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_start_horiz_retrace; + break; + case CRTC_END_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_end_horiz_retrace; + break; + case CRTC_VERT_TOTAL: + *val = sc->vga_crtc.crtc_vert_total; + break; + case CRTC_OVERFLOW: + *val = sc->vga_crtc.crtc_overflow; + break; + case CRTC_PRESET_ROW_SCAN: + *val = sc->vga_crtc.crtc_present_row_scan; + break; + case CRTC_MAX_SCAN_LINE: + *val = sc->vga_crtc.crtc_max_scan_line; + break; + case CRTC_CURSOR_START: + *val = sc->vga_crtc.crtc_cursor_start; + break; + case CRTC_CURSOR_END: + *val = sc->vga_crtc.crtc_cursor_end; + break; + case CRTC_START_ADDR_HIGH: + *val = sc->vga_crtc.crtc_start_addr_high; + break; + case CRTC_START_ADDR_LOW: + *val = sc->vga_crtc.crtc_start_addr_low; + break; + case CRTC_CURSOR_LOC_HIGH: + *val = sc->vga_crtc.crtc_cursor_loc_high; + break; + case CRTC_CURSOR_LOC_LOW: + *val = sc->vga_crtc.crtc_cursor_loc_low; + break; + case CRTC_VERT_RETRACE_START: + *val = sc->vga_crtc.crtc_vert_retrace_start; + break; + case CRTC_VERT_RETRACE_END: + *val = sc->vga_crtc.crtc_vert_retrace_end; + break; + case CRTC_VERT_DISP_END: + *val = sc->vga_crtc.crtc_vert_disp_end; + break; + case CRTC_OFFSET: + *val = sc->vga_crtc.crtc_offset; + break; + case CRTC_UNDERLINE_LOC: + *val = sc->vga_crtc.crtc_underline_loc; + break; + case CRTC_START_VERT_BLANK: + *val = sc->vga_crtc.crtc_start_vert_blank; + break; + case CRTC_END_VERT_BLANK: + *val = sc->vga_crtc.crtc_end_vert_blank; + break; + case CRTC_MODE_CONTROL: + *val = sc->vga_crtc.crtc_mode_ctrl; + break; + case CRTC_LINE_COMPARE: + *val = sc->vga_crtc.crtc_line_compare; + break; + default: + //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + *val = sc->vga_atc.atc_index; + break; + case ATC_DATA_PORT: + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... ATC_PALETTE15: + *val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index]; + break; + case ATC_MODE_CONTROL: + *val = sc->vga_atc.atc_mode; + break; + case ATC_OVERSCAN_COLOR: + *val = sc->vga_atc.atc_overscan_color; + break; + case ATC_COLOR_PLANE_ENABLE: + *val = sc->vga_atc.atc_color_plane_enb; + break; + case ATC_HORIZ_PIXEL_PANNING: + *val = sc->vga_atc.atc_horiz_pixel_panning; + break; + case ATC_COLOR_SELECT: + *val = sc->vga_atc.atc_color_select; + break; + default: + //printf("XXX VGA ATC inb 0x%04x at index %d\n", port , sc->vga_atc.atc_index); + assert(0); + break; + } + break; + case SEQ_IDX_PORT: + *val = sc->vga_seq.seq_index; + break; + case SEQ_DATA_PORT: + switch (sc->vga_seq.seq_index) { + case SEQ_RESET: + *val = sc->vga_seq.seq_reset; + break; + case SEQ_CLOCKING_MODE: + *val = sc->vga_seq.seq_clock_mode; + break; + case SEQ_MAP_MASK: + *val = sc->vga_seq.seq_map_mask; + break; + case SEQ_CHAR_MAP_SELECT: + *val = sc->vga_seq.seq_cmap_sel; + break; + case SEQ_MEMORY_MODE: + *val = sc->vga_seq.seq_mm; + break; + default: + //printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index); + assert(0); + break; + } + break; + case DAC_DATA_PORT: + *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + + sc->vga_dac.dac_rd_subindex]; + sc->vga_dac.dac_rd_subindex++; + if (sc->vga_dac.dac_rd_subindex == 3) { + sc->vga_dac.dac_rd_index++; + sc->vga_dac.dac_rd_subindex = 0; + } + break; + case GC_IDX_PORT: + *val = sc->vga_gc.gc_index; + break; + case GC_DATA_PORT: + switch (sc->vga_gc.gc_index) { + case GC_SET_RESET: + *val = sc->vga_gc.gc_set_reset; + break; + case GC_ENABLE_SET_RESET: + *val = sc->vga_gc.gc_enb_set_reset; + break; + case GC_COLOR_COMPARE: + *val = sc->vga_gc.gc_color_compare; + break; + case GC_DATA_ROTATE: + *val = sc->vga_gc.gc_rotate; + break; + case GC_READ_MAP_SELECT: + *val = sc->vga_gc.gc_read_map_sel; + break; + case GC_MODE: + *val = sc->vga_gc.gc_mode; + break; + case GC_MISCELLANEOUS: + *val = sc->vga_gc.gc_misc; + break; + case GC_COLOR_DONT_CARE: + *val = sc->vga_gc.gc_color_dont_care; + break; + case GC_BIT_MASK: + *val = sc->vga_gc.gc_bit_mask; + break; + default: + //printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case GEN_MISC_OUTPUT_PORT: + *val = sc->vga_misc; + break; + case GEN_INPUT_STS0_PORT: + assert(0); + break; + case GEN_INPUT_STS1_MONO_PORT: + case GEN_INPUT_STS1_COLOR_PORT: + sc->vga_atc.atc_flipflop = 0; +#ifdef __FreeBSD__ + sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE; + //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#else + /* + * During the bhyve bring-up process, a guest image was failing + * to successfully boot. It appeared to be spinning, waiting + * for this value to be toggled. Until it can be ruled out + * that this is unnecessary (and documentation seems to + * indicate that it should be present), the toggle should + * remain. + */ + sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#endif + *val = sc->vga_sts1; + break; + case GEN_FEATURE_CTRL_PORT: + // OpenBSD calls this with bytes = 1 + //assert(0); + *val = 0; + break; + case 0x3c3: + *val = 0; + break; + default: + printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); + //assert(0); + return (-1); + } + + return (0); +} + +static int +vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, + uint8_t val, void *arg) +{ + struct vga_softc *sc = arg; + + switch (port) { + case CRTC_IDX_MONO_PORT: + case CRTC_IDX_COLOR_PORT: + sc->vga_crtc.crtc_index = val; + break; + case CRTC_DATA_MONO_PORT: + case CRTC_DATA_COLOR_PORT: + switch (sc->vga_crtc.crtc_index) { + case CRTC_HORIZ_TOTAL: + sc->vga_crtc.crtc_horiz_total = val; + break; + case CRTC_HORIZ_DISP_END: + sc->vga_crtc.crtc_horiz_disp_end = val; + break; + case CRTC_START_HORIZ_BLANK: + sc->vga_crtc.crtc_start_horiz_blank = val; + break; + case CRTC_END_HORIZ_BLANK: + sc->vga_crtc.crtc_end_horiz_blank = val; + break; + case CRTC_START_HORIZ_RETRACE: + sc->vga_crtc.crtc_start_horiz_retrace = val; + break; + case CRTC_END_HORIZ_RETRACE: + sc->vga_crtc.crtc_end_horiz_retrace = val; + break; + case CRTC_VERT_TOTAL: + sc->vga_crtc.crtc_vert_total = val; + break; + case CRTC_OVERFLOW: + sc->vga_crtc.crtc_overflow = val; + break; + case CRTC_PRESET_ROW_SCAN: + sc->vga_crtc.crtc_present_row_scan = val; + break; + case CRTC_MAX_SCAN_LINE: + sc->vga_crtc.crtc_max_scan_line = val; + break; + case CRTC_CURSOR_START: + sc->vga_crtc.crtc_cursor_start = val; + sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0; + break; + case CRTC_CURSOR_END: + sc->vga_crtc.crtc_cursor_end = val; + break; + case CRTC_START_ADDR_HIGH: + sc->vga_crtc.crtc_start_addr_high = val; + sc->vga_crtc.crtc_start_addr &= 0x00ff; + sc->vga_crtc.crtc_start_addr |= (val << 8); + break; + case CRTC_START_ADDR_LOW: + sc->vga_crtc.crtc_start_addr_low = val; + sc->vga_crtc.crtc_start_addr &= 0xff00; + sc->vga_crtc.crtc_start_addr |= (val & 0xff); + break; + case CRTC_CURSOR_LOC_HIGH: + sc->vga_crtc.crtc_cursor_loc_high = val; + sc->vga_crtc.crtc_cursor_loc &= 0x00ff; + sc->vga_crtc.crtc_cursor_loc |= (val << 8); + break; + case CRTC_CURSOR_LOC_LOW: + sc->vga_crtc.crtc_cursor_loc_low = val; + sc->vga_crtc.crtc_cursor_loc &= 0xff00; + sc->vga_crtc.crtc_cursor_loc |= (val & 0xff); + break; + case CRTC_VERT_RETRACE_START: + sc->vga_crtc.crtc_vert_retrace_start = val; + break; + case CRTC_VERT_RETRACE_END: + sc->vga_crtc.crtc_vert_retrace_end = val; + break; + case CRTC_VERT_DISP_END: + sc->vga_crtc.crtc_vert_disp_end = val; + break; + case CRTC_OFFSET: + sc->vga_crtc.crtc_offset = val; + break; + case CRTC_UNDERLINE_LOC: + sc->vga_crtc.crtc_underline_loc = val; + break; + case CRTC_START_VERT_BLANK: + sc->vga_crtc.crtc_start_vert_blank = val; + break; + case CRTC_END_VERT_BLANK: + sc->vga_crtc.crtc_end_vert_blank = val; + break; + case CRTC_MODE_CONTROL: + sc->vga_crtc.crtc_mode_ctrl = val; + break; + case CRTC_LINE_COMPARE: + sc->vga_crtc.crtc_line_compare = val; + break; + default: + //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + if (sc->vga_atc.atc_flipflop == 0) { + if (sc->vga_atc.atc_index & 0x20) + assert(0); + sc->vga_atc.atc_index = val & ATC_IDX_MASK; + } else { + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... ATC_PALETTE15: + sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f; + break; + case ATC_MODE_CONTROL: + sc->vga_atc.atc_mode = val; + break; + case ATC_OVERSCAN_COLOR: + sc->vga_atc.atc_overscan_color = val; + break; + case ATC_COLOR_PLANE_ENABLE: + sc->vga_atc.atc_color_plane_enb = val; + break; + case ATC_HORIZ_PIXEL_PANNING: + sc->vga_atc.atc_horiz_pixel_panning = val; + break; + case ATC_COLOR_SELECT: + sc->vga_atc.atc_color_select = val; + sc->vga_atc.atc_color_select_45 = + (val & ATC_CS_C45) << 4; + sc->vga_atc.atc_color_select_67 = + ((val & ATC_CS_C67) >> 2) << 6; + break; + default: + //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); + assert(0); + break; + } + } + sc->vga_atc.atc_flipflop ^= 1; + break; + case ATC_DATA_PORT: + break; + case SEQ_IDX_PORT: + sc->vga_seq.seq_index = val & 0x1f; + break; + case SEQ_DATA_PORT: + switch (sc->vga_seq.seq_index) { + case SEQ_RESET: + sc->vga_seq.seq_reset = val; + break; + case SEQ_CLOCKING_MODE: + sc->vga_seq.seq_clock_mode = val; + sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 8 : 9; + break; + case SEQ_MAP_MASK: + sc->vga_seq.seq_map_mask = val; + break; + case SEQ_CHAR_MAP_SELECT: + sc->vga_seq.seq_cmap_sel = val; + + sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB; + sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB; + break; + case SEQ_MEMORY_MODE: + sc->vga_seq.seq_mm = val; + /* Windows queries Chain4 */ + //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + break; + default: + //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); + assert(0); + break; + } + break; + case DAC_MASK: + break; + case DAC_IDX_RD_PORT: + sc->vga_dac.dac_rd_index = val; + sc->vga_dac.dac_rd_subindex = 0; + break; + case DAC_IDX_WR_PORT: + sc->vga_dac.dac_wr_index = val; + sc->vga_dac.dac_wr_subindex = 0; + break; + case DAC_DATA_PORT: + sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index + + sc->vga_dac.dac_wr_subindex] = val; + sc->vga_dac.dac_wr_subindex++; + if (sc->vga_dac.dac_wr_subindex == 3) { + sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] = + ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0)); + + sc->vga_dac.dac_wr_index++; + sc->vga_dac.dac_wr_subindex = 0; + } + break; + case GC_IDX_PORT: + sc->vga_gc.gc_index = val; + break; + case GC_DATA_PORT: + switch (sc->vga_gc.gc_index) { + case GC_SET_RESET: + sc->vga_gc.gc_set_reset = val; + break; + case GC_ENABLE_SET_RESET: + sc->vga_gc.gc_enb_set_reset = val; + break; + case GC_COLOR_COMPARE: + sc->vga_gc.gc_color_compare = val; + break; + case GC_DATA_ROTATE: + sc->vga_gc.gc_rotate = val; + sc->vga_gc.gc_op = (val >> 3) & 0x3; + break; + case GC_READ_MAP_SELECT: + sc->vga_gc.gc_read_map_sel = val; + break; + case GC_MODE: + sc->vga_gc.gc_mode = val; + sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0; + assert(!sc->vga_gc.gc_mode_c4); + sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; + sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; + sc->vga_gc.gc_mode_wm = val & 0x3; + + if (sc->gc_image) + sc->gc_image->vgamode = 1; + break; + case GC_MISCELLANEOUS: + sc->vga_gc.gc_misc = val; + sc->vga_gc.gc_misc_gm = val & GC_MISC_GM; + sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >> + GC_MISC_MM_SHIFT; + break; + case GC_COLOR_DONT_CARE: + sc->vga_gc.gc_color_dont_care = val; + break; + case GC_BIT_MASK: + sc->vga_gc.gc_bit_mask = val; + break; + default: + //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index); + assert(0); + break; + } + break; + case GEN_INPUT_STS0_PORT: + /* write to Miscellaneous Output Register */ + sc->vga_misc = val; + break; + case GEN_INPUT_STS1_MONO_PORT: + case GEN_INPUT_STS1_COLOR_PORT: + /* write to Feature Control Register */ + break; +// case 0x3c3: +// break; + default: + printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val); + //assert(0); + return (-1); + } + return (0); +} + +static int +vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint8_t val; + int error; + + switch (bytes) { + case 1: + if (in) { + *eax &= ~0xff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + } + break; + case 2: + if (in) { + *eax &= ~0xffff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + error = vga_port_in_handler(ctx, in, port + 1, 1, + &val, arg); + if (!error) { + *eax |= (val & 0xff) << 8; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + val = (*eax >> 8) & 0xff; + error =vga_port_out_handler(ctx, in, port + 1, 1, + val, arg); + } + break; + default: + assert(0); + return (-1); + } + + return (error); +} + +void * +vga_init(int io_only) +{ + struct inout_port iop; + struct vga_softc *sc; + int port, error; + + sc = calloc(1, sizeof(struct vga_softc)); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "VGA"; + for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) { + iop.port = port; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = vga_port_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + } + + sc->gc_image = console_get_image(); + + /* only handle io ports; vga graphics is disabled */ + if (io_only) + return(sc); + + sc->mr.name = "VGA memory"; + sc->mr.flags = MEM_F_RW; + sc->mr.base = 640 * KB; + sc->mr.size = 128 * KB; + sc->mr.handler = vga_mem_handler; + sc->mr.arg1 = sc; + error = register_mem_fallback(&sc->mr); + assert(error == 0); + + sc->vga_ram = malloc(256 * KB); + memset(sc->vga_ram, 0, 256 * KB); + + { + static uint8_t palette[] = { + 0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a, + 0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a, + 0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f, + 0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f, + }; + int i; + + memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t)); + for (i = 0; i < 16; i++) { + sc->vga_dac.dac_palette_rgb[i] = + ((((sc->vga_dac.dac_palette[3*i + 0] << 2) | + ((sc->vga_dac.dac_palette[3*i + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*i + 1] << 2) | + ((sc->vga_dac.dac_palette[3*i + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*i + 2] << 2) | + ((sc->vga_dac.dac_palette[3*i + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 2] & 0x1)) << 0)); + } + } + + return (sc); +} diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h new file mode 100644 index 0000000000..36c6dc15fa --- /dev/null +++ b/usr/src/cmd/bhyve/vga.h @@ -0,0 +1,162 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VGA_H_ +#define _VGA_H_ + +#define VGA_IOPORT_START 0x3c0 +#define VGA_IOPORT_END 0x3df + +/* General registers */ +#define GEN_INPUT_STS0_PORT 0x3c2 +#define GEN_FEATURE_CTRL_PORT 0x3ca +#define GEN_MISC_OUTPUT_PORT 0x3cc +#define GEN_INPUT_STS1_MONO_PORT 0x3ba +#define GEN_INPUT_STS1_COLOR_PORT 0x3da +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ + +/* Attribute controller registers. */ +#define ATC_IDX_PORT 0x3c0 +#define ATC_DATA_PORT 0x3c1 + +#define ATC_IDX_MASK 0x1f +#define ATC_PALETTE0 0 +#define ATC_PALETTE15 15 +#define ATC_MODE_CONTROL 16 +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_OVERSCAN_COLOR 17 +#define ATC_COLOR_PLANE_ENABLE 18 +#define ATC_HORIZ_PIXEL_PANNING 19 +#define ATC_COLOR_SELECT 20 +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ + +/* Sequencer registers. */ +#define SEQ_IDX_PORT 0x3c4 +#define SEQ_DATA_PORT 0x3c5 + +#define SEQ_RESET 0 +#define SEQ_RESET_ASYNC 0x1 +#define SEQ_RESET_SYNC 0x2 +#define SEQ_CLOCKING_MODE 1 +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_MAP_MASK 2 +#define SEQ_CHAR_MAP_SELECT 3 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_MEMORY_MODE 4 +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ + +/* Graphics controller registers. */ +#define GC_IDX_PORT 0x3ce +#define GC_DATA_PORT 0x3cf + +#define GC_SET_RESET 0 +#define GC_ENABLE_SET_RESET 1 +#define GC_COLOR_COMPARE 2 +#define GC_DATA_ROTATE 3 +#define GC_READ_MAP_SELECT 4 +#define GC_MODE 5 +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ + +#define GC_MISCELLANEOUS 6 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 +#define GC_COLOR_DONT_CARE 7 +#define GC_BIT_MASK 8 + +/* CRT controller registers. */ +#define CRTC_IDX_MONO_PORT 0x3b4 +#define CRTC_DATA_MONO_PORT 0x3b5 +#define CRTC_IDX_COLOR_PORT 0x3d4 +#define CRTC_DATA_COLOR_PORT 0x3d5 + +#define CRTC_HORIZ_TOTAL 0 +#define CRTC_HORIZ_DISP_END 1 +#define CRTC_START_HORIZ_BLANK 2 +#define CRTC_END_HORIZ_BLANK 3 +#define CRTC_START_HORIZ_RETRACE 4 +#define CRTC_END_HORIZ_RETRACE 5 +#define CRTC_VERT_TOTAL 6 +#define CRTC_OVERFLOW 7 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_PRESET_ROW_SCAN 8 +#define CRTC_MAX_SCAN_LINE 9 +#define CRTC_MSL_MSL 0x1f +#define CRTC_CURSOR_START 10 +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CURSOR_END 11 +#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_START_ADDR_HIGH 12 +#define CRTC_START_ADDR_LOW 13 +#define CRTC_CURSOR_LOC_HIGH 14 +#define CRTC_CURSOR_LOC_LOW 15 +#define CRTC_VERT_RETRACE_START 16 +#define CRTC_VERT_RETRACE_END 17 +#define CRTC_VRE_MASK 0xf +#define CRTC_VERT_DISP_END 18 +#define CRTC_OFFSET 19 +#define CRTC_UNDERLINE_LOC 20 +#define CRTC_START_VERT_BLANK 21 +#define CRTC_END_VERT_BLANK 22 +#define CRTC_MODE_CONTROL 23 +#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_LINE_COMPARE 24 + +/* DAC registers */ +#define DAC_MASK 0x3c6 +#define DAC_IDX_RD_PORT 0x3c7 +#define DAC_IDX_WR_PORT 0x3c8 +#define DAC_DATA_PORT 0x3c9 + +void *vga_init(int io_only); + +#endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c new file mode 100644 index 0000000000..d3ff5e3951 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.c @@ -0,0 +1,794 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/uio.h> + +#include <stdio.h> +#include <stdint.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by <https://www.google.com/#output=search&q=virtio+spec> + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + if (vs->vs_isr) + pci_lintr_deassert(vs->vs_pi); + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. + */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + VS_LOCK(vs); + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + VS_UNLOCK(vs); + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else + vs->vs_flags &= ~VIRTIO_USE_MSIX; + + /* Only 1 MSI vector for bhyve */ + pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. + * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_pi->pi_vmctx; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_vc->vc_hv_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the currently-first request chain back to the available queue. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_retchain(struct vqueue_info *vq) +{ + + vq->vq_last_avail--; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - head is the same value we compute in vq_iovecs(). + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = idx; + vue->vu_tlen = iolen; +#ifndef __FreeBSD__ + /* + * Ensure the used descriptor is visible before updating the index. + * This is necessary on ISAs with memory ordering less strict than x86. + */ + wmb(); +#endif + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; +#ifndef __FreeBSD__ + /* + * Use full memory barrier between vu_idx store from preceding + * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or + * va_flags below. + */ + mb(); +#endif + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). + */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_size; /* size (bytes) */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, + { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, + { VTCFG_R_PFN, 4, 0, "PFN" }, + { VTCFG_R_QNUM, 2, 1, "QNUM" }, + { VTCFG_R_QSEL, 2, 0, "QSEL" }, + { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, + { VTCFG_R_STATUS, 1, 0, "STATUS" }, + { VTCFG_R_ISR, 1, 0, "ISR" }, + { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, + { VTCFG_R_QVEC, 2, 0, "QVEC" }, +}; + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if (cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +/* + * Handle pci config space reads. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = pi->pi_arg; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + uint32_t value; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + * If that fails, fall into general code. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size) { + if (cr != NULL) { + /* offset must be OK, so size must be bad */ + fprintf(stderr, + "%s: read from %s: bad size %d\r\n", + name, cr->cr_name, size); + } else { + fprintf(stderr, + "%s: read from bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + value = vc->vc_hv_caps; + break; + case VTCFG_R_GUESTCAP: + value = vs->vs_negotiated_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq < vc->vc_nvq) + value = vs->vs_queues[vs->vs_curq].vq_pfn; + break; + case VTCFG_R_QNUM: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + break; + case VTCFG_R_QSEL: + value = vs->vs_curq; + break; + case VTCFG_R_QNOTIFY: + value = 0; /* XXX */ + break; + case VTCFG_R_STATUS: + value = vs->vs_status; + break; + case VTCFG_R_ISR: + value = vs->vs_isr; + vs->vs_isr = 0; /* a read clears this flag */ + if (value) + pci_lintr_deassert(pi); + break; + case VTCFG_R_CFGVEC: + value = vs->vs_msix_cfg_idx; + break; + case VTCFG_R_QVEC: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_msix_idx : + VIRTIO_MSI_NO_VECTOR; + break; + } +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +void +vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = pi->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size || cr->cr_ro) { + if (cr != NULL) { + /* offset must be OK, wrong size and/or reg is R/O */ + if (cr->cr_size != size) + fprintf(stderr, + "%s: write to %s: bad size %d\r\n", + name, cr->cr_name, size); + if (cr->cr_ro) + fprintf(stderr, + "%s: write to read-only reg %s\r\n", + name, cr->cr_name); + } else { + fprintf(stderr, + "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); + break; + case VTCFG_R_PFN: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vi_vq_init(vs, value); + break; + case VTCFG_R_QSEL: + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. + */ + vs->vs_curq = value; + break; + case VTCFG_R_QNOTIFY: + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + goto done; + } + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + case VTCFG_R_STATUS: + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VTCFG_R_CFGVEC: + vs->vs_msix_cfg_idx = value; + break; + case VTCFG_R_QVEC: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_msix_idx = value; + break; + } + goto done; + +bad_qindex: + fprintf(stderr, + "%s: write config reg %s: curq %d >= max %d\r\n", + name, cr->cr_name, vs->vs_curq, vc->vc_nvq); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h new file mode 100644 index 0000000000..a2c3362ec2 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.h @@ -0,0 +1,484 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#include <pthread_np.h> + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": <N> descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; <N> uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; <N> elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number <N> that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If <N> is sufficiently large, the above will occupy more than + * two pages. In any case, all pages must be physically contiguous + * within the guest's physical address space. + * + * The <N> 16-byte "desc" descriptors consist of a 64-bit guest + * physical address <addr>, a 32-bit length <len>, a 16-bit + * <flags>, and a 16-bit <next> field (all in guest byte order). + * + * There are three flags that may be set : + * NEXT descriptor is chained, so use its "next" field + * WRITE descriptor is for host to write into guest RAM + * (else host is to read from guest RAM) + * INDIRECT descriptor address field is (guest physical) + * address of a linear array of descriptors + * + * Unless INDIRECT is set, <len> is the number of bytes that may + * be read/written from guest physical address <addr>. If + * INDIRECT is set, WRITE is ignored and <len> provides the length + * of the indirect descriptors (and <len> must be a multiple of + * 16). Note that NEXT may still be set in the main descriptor + * pointing to the indirect, and should be set in each indirect + * descriptor that uses the next descriptor (these should generally + * be numbered sequentially). However, INDIRECT must not be set + * in the indirect descriptors. Upon reaching an indirect descriptor + * without a NEXT bit, control returns to the direct descriptors. + * + * Except inside an indirect, each <next> value must be in the + * range [0 .. N) (i.e., the half-open interval). (Inside an + * indirect, each <next> must be in the range [0 .. <len>/16).) + * + * The "avail" data structures reside in the same pages as the + * "desc" structures since both together are used by the device to + * pass information to the hypervisor's virtual driver. These + * begin with a 16-bit <flags> field and 16-bit index <idx>, then + * have <N> 16-bit <ring> values, followed by one final 16-bit + * field <used_event>. The <N> <ring> entries are simply indices + * indices into the descriptor ring (and thus must meet the same + * constraints as each <next> value). However, <idx> is counted + * up from 0 (initially) and simply wraps around after 65535; it + * is taken mod <N> to find the next available entry. + * + * The "used" ring occupies a separate page or pages, and contains + * values written from the virtual driver back to the guest OS. + * This begins with a 16-bit <flags> and 16-bit <idx>, then there + * are <N> "vring_used" elements, followed by a 16-bit <avail_event>. + * The <N> "vring_used" elements consist of a 32-bit <id> and a + * 32-bit <len> (vu_tlen below). The <id> is simply the index of + * the head of a descriptor chain the guest made available + * earlier, and the <len> is the number of bytes actually written, + * e.g., in the case of a network driver that provided a large + * receive buffer but received only a small amount of data. + * + * The two event fields, <used_event> and <avail_event>, in the + * avail and used rings (respectively -- note the reversal!), are + * always provided, but are used only if the virtual device + * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature + * negotiation. Similarly, both rings provide a flag -- + * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in + * their <flags> field, indicating that the guest does not need an + * interrupt, or that the hypervisor driver does not need a + * notify, when descriptors are added to the corresponding ring. + * (These are provided only for interrupt optimization and need + * not be implemented.) + */ +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +struct virtio_desc { /* AKA vring_desc */ + uint64_t vd_addr; /* guest physical address */ + uint32_t vd_len; /* length of scatter/gather seg */ + uint16_t vd_flags; /* VRING_F_DESC_* */ + uint16_t vd_next; /* next desc if F_NEXT */ +} __packed; + +struct virtio_used { /* AKA vring_used_elem */ + uint32_t vu_idx; /* head of used descriptor chain */ + uint32_t vu_tlen; /* length written-to */ +} __packed; + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct vring_avail { + uint16_t va_flags; /* VRING_AVAIL_F_* */ + uint16_t va_idx; /* counts to 65535, then cycles */ + uint16_t va_ring[]; /* size N, reported in QNUM value */ +/* uint16_t va_used_event; -- after N ring entries */ +} __packed; + +#define VRING_USED_F_NO_NOTIFY 1 +struct vring_used { + uint16_t vu_flags; /* VRING_USED_F_* */ + uint16_t vu_idx; /* counts to 65535, then cycles */ + struct virtio_used vu_ring[]; /* size N */ +/* uint16_t vu_avail_event; -- after N ring entries */ +} __packed; + +/* + * The address of any given virtual queue is determined by a single + * Page Frame Number register. The guest writes the PFN into the + * PCI config space. However, a device that has two or more + * virtqueues can have a different PFN, and size, for each queue. + * The number of queues is determinable via the PCI config space + * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means + * queue #0, 1 means queue#1, etc. Once a queue is selected, the + * remaining PFN and QNUM registers refer to that queue. + * + * QNUM is a read-only register containing a nonzero power of two + * that indicates the (hypervisor's) queue size. Or, if reading it + * produces zero, the hypervisor does not have a corresponding + * queue. (The number of possible queues depends on the virtual + * device. The block device has just one; the network device + * provides either two -- 0 = receive, 1 = transmit -- or three, + * with 2 = control.) + * + * PFN is a read/write register giving the physical page address of + * the virtqueue in guest memory (the guest must allocate enough space + * based on the hypervisor's provided QNUM). + * + * QNOTIFY is effectively write-only: when the guest writes a queue + * number to the register, the hypervisor should scan the specified + * virtqueue. (Reading QNOTIFY currently always gets 0). + */ + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + * + * XXX Should really be merged with <dev/virtio/virtio.h> defines + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_SCSI 0x1008 + +/* + * PCI config space constants. + * + * If MSI-X is enabled, the ISR register is generally not used, + * and the configuration vector and queue vector appear at offsets + * 20 and 22 with the remaining configuration registers at 24. + * If MSI-X is not enabled, those two registers disappear and + * the remaining configuration registers start at offset 20. + */ +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 + +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. + */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification */ +static inline size_t +vring_size(u_int qsz) +{ + size_t size; + + /* constant 3 below = va_flags, va_idx, va_used_event */ + size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); + size = roundup2(size, VRING_ALIGN); + + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ + size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; + size = roundup2(size, VRING_ALIGN); + + return (size); +} + +struct vmctx; +struct pci_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? */ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct pci_devinst *vs_pi; /* PCI device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + uint8_t vs_status; /* value from last status write */ + uint8_t vs_isr; /* ISR flags, if not MSI-X */ + uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? */ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ + +}; +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + + if (pci_msix_enabled(vs->vs_pi)) + pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); + else { +#ifndef __FreeBSD__ + boolean_t unlock = B_FALSE; + + if (vs->vs_mtx && !pthread_mutex_isowned_np(vs->vs_mtx)) { + unlock = B_TRUE; + pthread_mutex_lock(vs->vs_mtx); + } +#else + VS_LOCK(vs); +#endif + vs->vs_isr |= VTCFG_ISR_QUEUES; + pci_generate_msi(vs->vs_pi, 0); + pci_lintr_assert(vs->vs_pi); +#ifndef __FreeBSD__ + if (unlock) + pthread_mutex_unlock(vs->vs_mtx); +#else + VS_UNLOCK(vs); +#endif + } +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_bar(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_retchain(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size); +void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value); +#endif /* _VIRTIO_H_ */ diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c new file mode 100644 index 0000000000..3278ea591c --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.c @@ -0,0 +1,239 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/cpufunc.h> +#include <machine/vmm.h> +#include <machine/specialreg.h> + +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xmsr.h" + +static int cpu_vendor_intel, cpu_vendor_amd; + +int +emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) +{ + + if (cpu_vendor_intel) { + switch (num) { +#ifndef __FreeBSD__ + case MSR_PERFCTR0: + case MSR_PERFCTR1: + case MSR_EVNTSEL0: + case MSR_EVNTSEL1: + return (0); +#endif + case 0xd04: /* Sandy Bridge uncore PMCs */ + case 0xc24: + return (0); + case MSR_BIOS_UPDT_TRIG: + return (0); + case MSR_BIOS_SIGN: + return (0); + default: + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_HWCR: + /* + * Ignore writes to hardware configuration MSR. + */ + return (0); + + case MSR_NB_CFG1: + case MSR_IC_CFG: + return (0); /* Ignore writes */ + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* Ignore writes to the PerfEvtSel MSRs */ + return (0); + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* Ignore writes to the PerfCtr MSRs */ + return (0); + + case MSR_P_STATE_CONTROL: + /* Ignore write to change the P-state */ + return (0); + + default: + break; + } + } + return (-1); +} + +int +emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) +{ + int error = 0; + + if (cpu_vendor_intel) { + switch (num) { + case MSR_BIOS_SIGN: + case MSR_IA32_PLATFORM_ID: + case MSR_PKG_ENERGY_STATUS: + case MSR_PP0_ENERGY_STATUS: + case MSR_PP1_ENERGY_STATUS: + case MSR_DRAM_ENERGY_STATUS: + *val = 0; + break; + case MSR_RAPL_POWER_UNIT: + /* + * Use the default value documented in section + * "RAPL Interfaces" in Intel SDM vol3. + */ + *val = 0x000a1003; + break; + default: + error = -1; + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_BIOS_SIGN: + *val = 0; + break; + case MSR_HWCR: + /* + * Bios and Kernel Developer's Guides for AMD Families + * 12H, 14H, 15H and 16H. + */ + *val = 0x01000010; /* Reset value */ + *val |= 1 << 9; /* MONITOR/MWAIT disable */ + break; + + case MSR_NB_CFG1: + case MSR_IC_CFG: + /* + * The reset value is processor family dependent so + * just return 0. + */ + *val = 0; + break; + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* + * PerfEvtSel MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* + * PerfCtr MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_SMM_ADDR: + case MSR_SMM_MASK: + /* + * Return the reset value defined in the AMD Bios and + * Kernel Developer's Guide. + */ + *val = 0; + break; + + case MSR_P_STATE_LIMIT: + case MSR_P_STATE_CONTROL: + case MSR_P_STATE_STATUS: + case MSR_P_STATE_CONFIG(0): /* P0 configuration */ + *val = 0; + break; + + /* + * OpenBSD guests test bit 0 of this MSR to detect if the + * workaround for erratum 721 is already applied. + * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + */ + case 0xC0011029: + *val = 1; + break; + + default: + error = -1; + break; + } + } else { + error = -1; + } + return (error); +} + +int +init_msr(void) +{ + int error; + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + error = 0; + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + cpu_vendor_amd = 1; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + cpu_vendor_intel = 1; + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + error = -1; + } + return (error); +} diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h new file mode 100644 index 0000000000..1fb47c3ae2 --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int init_msr(void); +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val); + +#endif diff --git a/usr/src/cmd/bhyve/zhyve.c b/usr/src/cmd/bhyve/zhyve.c new file mode 100644 index 0000000000..d3e764b14d --- /dev/null +++ b/usr/src/cmd/bhyve/zhyve.c @@ -0,0 +1,167 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +/* + * This small 'zhyve' stub is init for the zone: we therefore need to pick up + * our command-line arguments placed in ZHYVE_CMD_FILE by the boot stub, do a + * little administration, and exec the real bhyve binary. + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/corectl.h> + +#define ZHYVE_CMD_FILE "/var/run/bhyve/zhyve.cmd" + +/* + * Do a read of the specified size or return an error. Returns 0 on success + * and -1 on error. Sets errno to EINVAL if EOF is encountered. For other + * errors, see read(2). + */ +static int +full_read(int fd, char *buf, size_t len) +{ + ssize_t nread = 0; + size_t totread = 0; + + while (totread < len) { + nread = read(fd, buf + totread, len - totread); + if (nread == 0) { + errno = EINVAL; + return (-1); + } + if (nread < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + return (-1); + } + totread += nread; + } + assert(totread == len); + + return (0); +} + +/* + * Reads the command line options from the packed nvlist in the file referenced + * by path. On success, 0 is returned and the members of *argv reference memory + * allocated from an nvlist. On failure, -1 is returned. + */ + +static int +parse_options_file(const char *path, uint_t *argcp, char ***argvp) +{ + int fd = -1; + struct stat stbuf; + char *buf = NULL; + nvlist_t *nvl = NULL; + int ret; + + if ((fd = open(path, O_RDONLY)) < 0 || + fstat(fd, &stbuf) != 0 || + (buf = malloc(stbuf.st_size)) == NULL || + full_read(fd, buf, stbuf.st_size) != 0 || + nvlist_unpack(buf, stbuf.st_size, &nvl, 0) != 0 || + nvlist_lookup_string_array(nvl, "bhyve_args", argvp, argcp) != 0) { + nvlist_free(nvl); + ret = -1; + } else { + ret = 0; + } + + free(buf); + (void) close(fd); + + (void) printf("Configuration from %s:\n", path); + nvlist_print(stdout, nvl); + + return (ret); +} + +/* + * Setup to suppress core dumps within the zone. + */ +static void +config_core_dumps() +{ + (void) core_set_options(0x0); +} + +int +main(int argc, char **argv) +{ + char **tmpargs; + uint_t zargc; + char **zargv; + int fd; + + config_core_dumps(); + + fd = open("/dev/null", O_WRONLY); + assert(fd >= 0); + if (fd != STDIN_FILENO) { + (void) dup2(fd, STDIN_FILENO); + (void) close(fd); + } + + fd = open("/dev/zfd/1", O_WRONLY); + assert(fd >= 0); + if (fd != STDOUT_FILENO) { + (void) dup2(fd, STDOUT_FILENO); + (void) close(fd); + } + setvbuf(stdout, NULL, _IONBF, 0); + + fd = open("/dev/zfd/2", O_WRONLY); + assert(fd >= 0); + if (fd != STDERR_FILENO) { + (void) dup2(fd, STDERR_FILENO); + (void) close(fd); + } + setvbuf(stderr, NULL, _IONBF, 0); + + if (parse_options_file(ZHYVE_CMD_FILE, &zargc, &zargv) != 0) { + (void) fprintf(stderr, "%s: failed to parse %s: %s\n", + argv[0], ZHYVE_CMD_FILE, strerror(errno)); + return (EXIT_FAILURE); + } + + /* + * Annoyingly, we need a NULL at the end. + */ + + if ((tmpargs = malloc(sizeof (*zargv) * (zargc + 1))) == NULL) { + perror("malloc failed"); + return (EXIT_FAILURE); + } + + memcpy(tmpargs, zargv, sizeof (*zargv) * zargc); + tmpargs[zargc] = NULL; + + (void) execv("/usr/sbin/bhyve", tmpargs); + + perror("execv failed"); + return (EXIT_FAILURE); +} |