diff options
author | Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> | 2017-09-26 12:19:41 +0200 |
---|---|---|
committer | Patrick Mooney <pmooney@pfmooney.com> | 2018-02-22 15:57:20 +0000 |
commit | 43f85cd4da7e7860e4d240f14e6b5dd45700c7b6 (patch) | |
tree | fcf7f982094418a90da65ed16d48689bbc72ca5b | |
parent | 44db5f1c904128c3fd7c7ec37e9d894a10e93f8c (diff) | |
download | illumos-joyent-43f85cd4da7e7860e4d240f14e6b5dd45700c7b6.tar.gz |
OS-6409 import Pluribus bhyve port
Authored by: Krupal Joshi <krupal.joshi@pluribusnetworks.com>
Contributed by: Pluribus Networks Inc.
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Mike Gerdts <mike.gerdts@joyent.com>
215 files changed, 52310 insertions, 2 deletions
diff --git a/usr/contrib/freebsd/amd64/machine/_types.h b/usr/contrib/freebsd/amd64/machine/_types.h new file mode 100644 index 0000000000..59994352b5 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/_types.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/_types.h 232261 2012-02-28 18:15:28Z tijl $ */ + +#include <x86/_types.h> diff --git a/usr/contrib/freebsd/amd64/machine/psl.h b/usr/contrib/freebsd/amd64/machine/psl.h new file mode 100644 index 0000000000..c660bfbab0 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/psl.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/psl.h 233204 2012-03-19 21:29:57Z tijl $ */ + +#include <x86/psl.h> diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h new file mode 100644 index 0000000000..41d4125cb9 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/specialreg.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */ + +#include <x86/specialreg.h> diff --git a/usr/contrib/freebsd/amd64/machine/timerreg.h b/usr/contrib/freebsd/amd64/machine/timerreg.h new file mode 100644 index 0000000000..bca7b4dd19 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/timerreg.h @@ -0,0 +1,54 @@ +/*- + * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/timerreg.h 177642 2008-03-26 20:09:21Z phk $ + */ + +/* + * The outputs of the three timers are connected as follows: + * + * timer 0 -> irq 0 + * timer 1 -> dma chan 0 (for dram refresh) + * timer 2 -> speaker (via keyboard controller) + * + * Timer 0 is used to call hardclock. + * Timer 2 is used to generate console beeps. + */ + +#ifndef _MACHINE_TIMERREG_H_ +#define _MACHINE_TIMERREG_H_ + +#ifdef _KERNEL + +#include <dev/ic/i8253reg.h> + +#define IO_TIMER1 0x40 /* 8253 Timer #1 */ +#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0) +#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1) +#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2) +#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE) + +#endif /* _KERNEL */ + +#endif /* _MACHINE_TIMERREG_H_ */ diff --git a/usr/contrib/freebsd/amd64/machine/vm.h b/usr/contrib/freebsd/amd64/machine/vm.h new file mode 100644 index 0000000000..885c1607ea --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/vm.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2009 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vm.h 233671 2012-03-29 16:51:22Z jhb $ + */ + +#ifndef _MACHINE_VM_H_ +#define _MACHINE_VM_H_ + +#include <machine/specialreg.h> + +/* Memory attributes. */ +#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE) +#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING) +#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH) +#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED) +#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK) +#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED) + +#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK + +#endif /* !_MACHINE_VM_H_ */ diff --git a/usr/contrib/freebsd/dev/acpica/acpi_hpet.h b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h new file mode 100644 index 0000000000..df817b7a2b --- /dev/null +++ b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2005 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/dev/acpica/acpi_hpet.h 224919 2011-08-16 21:51:29Z mav $ + */ + +#ifndef __ACPI_HPET_H__ +#define __ACPI_HPET_H__ + +#define HPET_MEM_WIDTH 0x400 /* Expected memory region size */ + +/* General registers */ +#define HPET_CAPABILITIES 0x0 /* General capabilities and ID */ +#define HPET_CAP_VENDOR_ID 0xffff0000 +#define HPET_CAP_LEG_RT 0x00008000 +#define HPET_CAP_COUNT_SIZE 0x00002000 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_CAP_NUM_TIM 0x00001f00 +#define HPET_CAP_REV_ID 0x000000ff +#define HPET_PERIOD 0x4 /* Period (1/hz) of timer */ +#define HPET_CONFIG 0x10 /* General configuration register */ +#define HPET_CNF_LEG_RT 0x00000002 +#define HPET_CNF_ENABLE 0x00000001 +#define HPET_ISR 0x20 /* General interrupt status register */ +#define HPET_MAIN_COUNTER 0xf0 /* Main counter register */ + +/* Timer registers */ +#define HPET_TIMER_CAP_CNF(x) ((x) * 0x20 + 0x100) +#define HPET_TCAP_INT_ROUTE 0xffffffff00000000 +#define HPET_TCAP_FSB_INT_DEL 0x00008000 +#define HPET_TCNF_FSB_EN 0x00004000 +#define HPET_TCNF_INT_ROUTE 0x00003e00 +#define HPET_TCNF_32MODE 0x00000100 +#define HPET_TCNF_VAL_SET 0x00000040 +#define HPET_TCAP_SIZE 0x00000020 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_TCAP_PER_INT 0x00000010 /* Supports periodic interrupts */ +#define HPET_TCNF_TYPE 0x00000008 /* 1 = periodic, 0 = one-shot */ +#define HPET_TCNF_INT_ENB 0x00000004 +#define HPET_TCNF_INT_TYPE 0x00000002 /* 1 = level triggered, 0 = edge */ +#define HPET_TIMER_COMPARATOR(x) ((x) * 0x20 + 0x108) +#define HPET_TIMER_FSB_VAL(x) ((x) * 0x20 + 0x110) +#define HPET_TIMER_FSB_ADDR(x) ((x) * 0x20 + 0x114) + +#define HPET_MIN_CYCLES 128 /* Period considered reliable. */ + +#endif /* !__ACPI_HPET_H__ */ diff --git a/usr/contrib/freebsd/dev/ic/i8253reg.h b/usr/contrib/freebsd/dev/ic/i8253reg.h new file mode 100644 index 0000000000..47568b3436 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/i8253reg.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 1993 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Header: timerreg.h,v 1.2 93/02/28 15:08:58 mccanne Exp + * $FreeBSD: head/sys/dev/ic/i8253reg.h 146215 2005-05-14 10:26:31Z nyan $ + */ + +/* + * Register definitions for the Intel 8253 Programmable Interval Timer. + * + * This chip has three independent 16-bit down counters that can be + * read on the fly. There are three mode registers and three countdown + * registers. The countdown registers are addressed directly, via the + * first three I/O ports. The three mode registers are accessed via + * the fourth I/O port, with two bits in the mode byte indicating the + * register. (Why are hardware interfaces always so braindead?). + * + * To write a value into the countdown register, the mode register + * is first programmed with a command indicating the which byte of + * the two byte register is to be modified. The three possibilities + * are load msb (TMR_MR_MSB), load lsb (TMR_MR_LSB), or load lsb then + * msb (TMR_MR_BOTH). + * + * To read the current value ("on the fly") from the countdown register, + * you write a "latch" command into the mode register, then read the stable + * value from the corresponding I/O port. For example, you write + * TMR_MR_LATCH into the corresponding mode register. Presumably, + * after doing this, a write operation to the I/O port would result + * in undefined behavior (but hopefully not fry the chip). + * Reading in this manner has no side effects. + */ + +/* + * Macros for specifying values to be written into a mode register. + */ +#define TIMER_REG_CNTR0 0 /* timer 0 counter port */ +#define TIMER_REG_CNTR1 1 /* timer 1 counter port */ +#define TIMER_REG_CNTR2 2 /* timer 2 counter port */ +#define TIMER_REG_MODE 3 /* timer mode port */ +#define TIMER_SEL0 0x00 /* select counter 0 */ +#define TIMER_SEL1 0x40 /* select counter 1 */ +#define TIMER_SEL2 0x80 /* select counter 2 */ +#define TIMER_INTTC 0x00 /* mode 0, intr on terminal cnt */ +#define TIMER_ONESHOT 0x02 /* mode 1, one shot */ +#define TIMER_RATEGEN 0x04 /* mode 2, rate generator */ +#define TIMER_SQWAVE 0x06 /* mode 3, square wave */ +#define TIMER_SWSTROBE 0x08 /* mode 4, s/w triggered strobe */ +#define TIMER_HWSTROBE 0x0a /* mode 5, h/w triggered strobe */ +#define TIMER_LATCH 0x00 /* latch counter for reading */ +#define TIMER_LSB 0x10 /* r/w counter LSB */ +#define TIMER_MSB 0x20 /* r/w counter MSB */ +#define TIMER_16BIT 0x30 /* r/w counter 16 bits, LSB first */ +#define TIMER_BCD 0x01 /* count in BCD */ diff --git a/usr/contrib/freebsd/dev/ic/i8259.h b/usr/contrib/freebsd/dev/ic/i8259.h new file mode 100644 index 0000000000..be523c1df4 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/i8259.h @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2003 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/dev/ic/i8259.h 151580 2005-10-23 09:05:51Z glebius $ + */ + +/* + * Register defintions for the i8259A programmable interrupt controller. + */ + +#ifndef _DEV_IC_I8259_H_ +#define _DEV_IC_I8259_H_ + +/* Initialization control word 1. Written to even address. */ +#define ICW1_IC4 0x01 /* ICW4 present */ +#define ICW1_SNGL 0x02 /* 1 = single, 0 = cascaded */ +#define ICW1_ADI 0x04 /* 1 = 4, 0 = 8 byte vectors */ +#define ICW1_LTIM 0x08 /* 1 = level trigger, 0 = edge */ +#define ICW1_RESET 0x10 /* must be 1 */ +/* 0x20 - 0x80 - in 8080/8085 mode only */ + +/* Initialization control word 2. Written to the odd address. */ +/* No definitions, it is the base vector of the IDT for 8086 mode */ + +/* Initialization control word 3. Written to the odd address. */ +/* For a master PIC, bitfield indicating a slave 8259 on given input */ +/* For slave, lower 3 bits are the slave's ID binary id on master */ + +/* Initialization control word 4. Written to the odd address. */ +#define ICW4_8086 0x01 /* 1 = 8086, 0 = 8080 */ +#define ICW4_AEOI 0x02 /* 1 = Auto EOI */ +#define ICW4_MS 0x04 /* 1 = buffered master, 0 = slave */ +#define ICW4_BUF 0x08 /* 1 = enable buffer mode */ +#define ICW4_SFNM 0x10 /* 1 = special fully nested mode */ + +/* Operation control words. Written after initialization. */ + +/* Operation control word type 1 */ +/* + * No definitions. Written to the odd address. Bitmask for interrupts. + * 1 = disabled. + */ + +/* Operation control word type 2. Bit 3 (0x08) must be zero. Even address. */ +#define OCW2_L0 0x01 /* Level */ +#define OCW2_L1 0x02 +#define OCW2_L2 0x04 +/* 0x08 must be 0 to select OCW2 vs OCW3 */ +/* 0x10 must be 0 to select OCW2 vs ICW1 */ +#define OCW2_EOI 0x20 /* 1 = EOI */ +#define OCW2_SL 0x40 /* EOI mode */ +#define OCW2_R 0x80 /* EOI mode */ + +/* Operation control word type 3. Bit 3 (0x08) must be set. Even address. */ +#define OCW3_RIS 0x01 /* 1 = read IS, 0 = read IR */ +#define OCW3_RR 0x02 /* register read */ +#define OCW3_P 0x04 /* poll mode command */ +/* 0x08 must be 1 to select OCW3 vs OCW2 */ +#define OCW3_SEL 0x08 /* must be 1 */ +/* 0x10 must be 0 to select OCW3 vs ICW1 */ +#define OCW3_SMM 0x20 /* special mode mask */ +#define OCW3_ESMM 0x40 /* enable SMM */ + +#endif /* !_DEV_IC_I8259_H_ */ diff --git a/usr/contrib/freebsd/dev/ic/ns16550.h b/usr/contrib/freebsd/dev/ic/ns16550.h new file mode 100644 index 0000000000..5e8f30e3e8 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/ns16550.h @@ -0,0 +1,240 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ns16550.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD: head/sys/dev/ic/ns16550.h 257170 2013-10-26 17:24:59Z zbb $ + */ + +/* + * NS8250... UART registers. + */ + +/* 8250 registers #[0-6]. */ + +#define com_data 0 /* data register (R/W) */ +#define REG_DATA com_data + +#define com_ier 1 /* interrupt enable register (W) */ +#define REG_IER com_ier +#define IER_ERXRDY 0x1 +#define IER_ETXRDY 0x2 +#define IER_ERLS 0x4 +#define IER_EMSC 0x8 + +#define IER_BITS "\20\1ERXRDY\2ETXRDY\3ERLS\4EMSC" + +#define com_iir 2 /* interrupt identification register (R) */ +#define REG_IIR com_iir +#define IIR_IMASK 0xf +#define IIR_RXTOUT 0xc +#define IIR_BUSY 0x7 +#define IIR_RLS 0x6 +#define IIR_RXRDY 0x4 +#define IIR_TXRDY 0x2 +#define IIR_NOPEND 0x1 +#define IIR_MLSC 0x0 +#define IIR_FIFO_MASK 0xc0 /* set if FIFOs are enabled */ + +#define IIR_BITS "\20\1NOPEND\2TXRDY\3RXRDY" + +#define com_lcr 3 /* line control register (R/W) */ +#define com_cfcr com_lcr /* character format control register (R/W) */ +#define REG_LCR com_lcr +#define LCR_DLAB 0x80 +#define CFCR_DLAB LCR_DLAB +#define LCR_EFR_ENABLE 0xbf /* magic to enable EFR on 16650 up */ +#define CFCR_EFR_ENABLE LCR_EFR_ENABLE +#define LCR_SBREAK 0x40 +#define CFCR_SBREAK LCR_SBREAK +#define LCR_PZERO 0x30 +#define CFCR_PZERO LCR_PZERO +#define LCR_PONE 0x20 +#define CFCR_PONE LCR_PONE +#define LCR_PEVEN 0x10 +#define CFCR_PEVEN LCR_PEVEN +#define LCR_PODD 0x00 +#define CFCR_PODD LCR_PODD +#define LCR_PENAB 0x08 +#define CFCR_PENAB LCR_PENAB +#define LCR_STOPB 0x04 +#define CFCR_STOPB LCR_STOPB +#define LCR_8BITS 0x03 +#define CFCR_8BITS LCR_8BITS +#define LCR_7BITS 0x02 +#define CFCR_7BITS LCR_7BITS +#define LCR_6BITS 0x01 +#define CFCR_6BITS LCR_6BITS +#define LCR_5BITS 0x00 +#define CFCR_5BITS LCR_5BITS + +#define com_mcr 4 /* modem control register (R/W) */ +#define REG_MCR com_mcr +#define MCR_PRESCALE 0x80 /* only available on 16650 up */ +#define MCR_LOOPBACK 0x10 +#define MCR_IE 0x08 +#define MCR_IENABLE MCR_IE +#define MCR_DRS 0x04 +#define MCR_RTS 0x02 +#define MCR_DTR 0x01 + +#define MCR_BITS "\20\1DTR\2RTS\3DRS\4IE\5LOOPBACK\10PRESCALE" + +#define com_lsr 5 /* line status register (R/W) */ +#define REG_LSR com_lsr +#define LSR_RCV_FIFO 0x80 +#define LSR_TEMT 0x40 +#define LSR_TSRE LSR_TEMT +#define LSR_THRE 0x20 +#define LSR_TXRDY LSR_THRE +#define LSR_BI 0x10 +#define LSR_FE 0x08 +#define LSR_PE 0x04 +#define LSR_OE 0x02 +#define LSR_RXRDY 0x01 +#define LSR_RCV_MASK 0x1f + +#define LSR_BITS "\20\1RXRDY\2OE\3PE\4FE\5BI\6THRE\7TEMT\10RCV_FIFO" + +#define com_msr 6 /* modem status register (R/W) */ +#define REG_MSR com_msr +#define MSR_DCD 0x80 +#define MSR_RI 0x40 +#define MSR_DSR 0x20 +#define MSR_CTS 0x10 +#define MSR_DDCD 0x08 +#define MSR_TERI 0x04 +#define MSR_DDSR 0x02 +#define MSR_DCTS 0x01 + +#define MSR_BITS "\20\1DCTS\2DDSR\3TERI\4DDCD\5CTS\6DSR\7RI\10DCD" + +/* 8250 multiplexed registers #[0-1]. Access enabled by LCR[7]. */ +#define com_dll 0 /* divisor latch low (R/W) */ +#define com_dlbl com_dll +#define com_dlm 1 /* divisor latch high (R/W) */ +#define com_dlbh com_dlm +#define REG_DLL com_dll +#define REG_DLH com_dlm + +/* 16450 register #7. Not multiplexed. */ +#define com_scr 7 /* scratch register (R/W) */ + +/* 16550 register #2. Not multiplexed. */ +#define com_fcr 2 /* FIFO control register (W) */ +#define com_fifo com_fcr +#define REG_FCR com_fcr +#define FCR_ENABLE 0x01 +#define FIFO_ENABLE FCR_ENABLE +#define FCR_RCV_RST 0x02 +#define FIFO_RCV_RST FCR_RCV_RST +#define FCR_XMT_RST 0x04 +#define FIFO_XMT_RST FCR_XMT_RST +#define FCR_DMA 0x08 +#define FIFO_DMA_MODE FCR_DMA +#define FCR_RX_LOW 0x00 +#define FIFO_RX_LOW FCR_RX_LOW +#define FCR_RX_MEDL 0x40 +#define FIFO_RX_MEDL FCR_RX_MEDL +#define FCR_RX_MEDH 0x80 +#define FIFO_RX_MEDH FCR_RX_MEDH +#define FCR_RX_HIGH 0xc0 +#define FIFO_RX_HIGH FCR_RX_HIGH + +#define FCR_BITS "\20\1ENABLE\2RCV_RST\3XMT_RST\4DMA" + +/* 16650 registers #2,[4-7]. Access enabled by LCR_EFR_ENABLE. */ + +#define com_efr 2 /* enhanced features register (R/W) */ +#define REG_EFR com_efr +#define EFR_CTS 0x80 +#define EFR_AUTOCTS EFR_CTS +#define EFR_RTS 0x40 +#define EFR_AUTORTS EFR_RTS +#define EFR_EFE 0x10 /* enhanced functions enable */ + +#define com_xon1 4 /* XON 1 character (R/W) */ +#define com_xon2 5 /* XON 2 character (R/W) */ +#define com_xoff1 6 /* XOFF 1 character (R/W) */ +#define com_xoff2 7 /* XOFF 2 character (R/W) */ + +#define DW_REG_USR 31 /* DesignWare derived Uart Status Reg */ +#define com_usr 39 /* Octeon 16750/16550 Uart Status Reg */ +#define REG_USR com_usr +#define USR_BUSY 1 /* Uart Busy. Serial transfer in progress */ +#define USR_TXFIFO_NOTFULL 2 /* Uart TX FIFO Not full */ + +/* 16950 register #1. Access enabled by ACR[7]. Also requires !LCR[7]. */ +#define com_asr 1 /* additional status register (R[0-7]/W[0-1]) */ + +/* 16950 register #3. R/W access enabled by ACR[7]. */ +#define com_rfl 3 /* receiver fifo level (R) */ + +/* + * 16950 register #4. Access enabled by ACR[7]. Also requires + * !LCR_EFR_ENABLE. + */ +#define com_tfl 4 /* transmitter fifo level (R) */ + +/* + * 16950 register #5. Accessible if !LCR_EFR_ENABLE. Read access also + * requires ACR[6]. + */ +#define com_icr 5 /* index control register (R/W) */ + +/* + * 16950 register #7. It is the same as com_scr except it has a different + * abbreviation in the manufacturer's data sheet and it also serves as an + * index into the Indexed Control register set. + */ +#define com_spr com_scr /* scratch pad (and index) register (R/W) */ +#define REG_SPR com_scr + +/* + * 16950 indexed control registers #[0-0x13]. Access is via index in SPR, + * data in ICR (if ICR is accessible). + */ + +#define com_acr 0 /* additional control register (R/W) */ +#define ACR_ASE 0x80 /* ASR/RFL/TFL enable */ +#define ACR_ICRE 0x40 /* ICR enable */ +#define ACR_TLE 0x20 /* TTL/RTL enable */ + +#define com_cpr 1 /* clock prescaler register (R/W) */ +#define com_tcr 2 /* times clock register (R/W) */ +#define com_ttl 4 /* transmitter trigger level (R/W) */ +#define com_rtl 5 /* receiver trigger level (R/W) */ +/* ... */ + +/* Hardware extension mode register for RSB-2000/3000. */ +#define com_emr com_msr +#define EMR_EXBUFF 0x04 +#define EMR_CTSFLW 0x08 +#define EMR_DSRFLW 0x10 +#define EMR_RTSFLW 0x20 +#define EMR_DTRFLW 0x40 +#define EMR_EFMODE 0x80 diff --git a/usr/contrib/freebsd/dev/pci/pcireg.h b/usr/contrib/freebsd/dev/pci/pcireg.h new file mode 100644 index 0000000000..32a569dbd4 --- /dev/null +++ b/usr/contrib/freebsd/dev/pci/pcireg.h @@ -0,0 +1,922 @@ +/*- + * Copyright (c) 1997, Stefan Esser <se@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: head/sys/dev/pci/pcireg.h 266468 2014-05-20 14:39:22Z mav $ + * + */ + +/* + * PCIM_xxx: mask to locate subfield in register + * PCIR_xxx: config register offset + * PCIC_xxx: device class + * PCIS_xxx: device subclass + * PCIP_xxx: device programming interface + * PCIV_xxx: PCI vendor ID (only required to fixup ancient devices) + * PCID_xxx: device ID + * PCIY_xxx: capability identification number + * PCIZ_xxx: extended capability identification number + */ + +/* some PCI bus constants */ +#define PCI_DOMAINMAX 65535 /* highest supported domain number */ +#define PCI_BUSMAX 255 /* highest supported bus number */ +#define PCI_SLOTMAX 31 /* highest supported slot number */ +#define PCI_FUNCMAX 7 /* highest supported function number */ +#define PCI_REGMAX 255 /* highest supported config register addr. */ +#define PCIE_REGMAX 4095 /* highest supported config register addr. */ +#define PCI_MAXHDRTYPE 2 + +#define PCIE_ARI_SLOTMAX 0 +#define PCIE_ARI_FUNCMAX 255 + +#define PCI_RID_BUS_SHIFT 8 +#define PCI_RID_SLOT_SHIFT 3 +#define PCI_RID_FUNC_SHIFT 0 + +#define PCI_RID(bus, slot, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((slot) & PCI_SLOTMAX) << PCI_RID_SLOT_SHIFT) | \ + (((func) & PCI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_ARI_RID(bus, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((func) & PCIE_ARI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_RID2BUS(rid) (((rid) >> PCI_RID_BUS_SHIFT) & PCI_BUSMAX) +#define PCI_RID2SLOT(rid) (((rid) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCI_RID2FUNC(rid) (((rid) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +#define PCIE_ARI_SLOT(func) (((func) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCIE_ARI_FUNC(func) (((func) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +/* PCI config header registers for all devices */ + +#define PCIR_DEVVENDOR 0x00 +#define PCIR_VENDOR 0x00 +#define PCIR_DEVICE 0x02 +#define PCIR_COMMAND 0x04 +#define PCIM_CMD_PORTEN 0x0001 +#define PCIM_CMD_MEMEN 0x0002 +#define PCIM_CMD_BUSMASTEREN 0x0004 +#define PCIM_CMD_SPECIALEN 0x0008 +#define PCIM_CMD_MWRICEN 0x0010 +#define PCIM_CMD_PERRESPEN 0x0040 +#define PCIM_CMD_SERRESPEN 0x0100 +#define PCIM_CMD_BACKTOBACK 0x0200 +#define PCIM_CMD_INTxDIS 0x0400 +#define PCIR_STATUS 0x06 +#define PCIM_STATUS_INTxSTATE 0x0008 +#define PCIM_STATUS_CAPPRESENT 0x0010 +#define PCIM_STATUS_66CAPABLE 0x0020 +#define PCIM_STATUS_BACKTOBACK 0x0080 +#define PCIM_STATUS_MDPERR 0x0100 +#define PCIM_STATUS_SEL_FAST 0x0000 +#define PCIM_STATUS_SEL_MEDIMUM 0x0200 +#define PCIM_STATUS_SEL_SLOW 0x0400 +#define PCIM_STATUS_SEL_MASK 0x0600 +#define PCIM_STATUS_STABORT 0x0800 +#define PCIM_STATUS_RTABORT 0x1000 +#define PCIM_STATUS_RMABORT 0x2000 +#define PCIM_STATUS_SERR 0x4000 +#define PCIM_STATUS_PERR 0x8000 +#define PCIR_REVID 0x08 +#define PCIR_PROGIF 0x09 +#define PCIR_SUBCLASS 0x0a +#define PCIR_CLASS 0x0b +#define PCIR_CACHELNSZ 0x0c +#define PCIR_LATTIMER 0x0d +#define PCIR_HDRTYPE 0x0e +#define PCIM_HDRTYPE 0x7f +#define PCIM_HDRTYPE_NORMAL 0x00 +#define PCIM_HDRTYPE_BRIDGE 0x01 +#define PCIM_HDRTYPE_CARDBUS 0x02 +#define PCIM_MFDEV 0x80 +#define PCIR_BIST 0x0f + +/* Capability Register Offsets */ + +#define PCICAP_ID 0x0 +#define PCICAP_NEXTPTR 0x1 + +/* Capability Identification Numbers */ + +#define PCIY_PMG 0x01 /* PCI Power Management */ +#define PCIY_AGP 0x02 /* AGP */ +#define PCIY_VPD 0x03 /* Vital Product Data */ +#define PCIY_SLOTID 0x04 /* Slot Identification */ +#define PCIY_MSI 0x05 /* Message Signaled Interrupts */ +#define PCIY_CHSWP 0x06 /* CompactPCI Hot Swap */ +#define PCIY_PCIX 0x07 /* PCI-X */ +#define PCIY_HT 0x08 /* HyperTransport */ +#define PCIY_VENDOR 0x09 /* Vendor Unique */ +#define PCIY_DEBUG 0x0a /* Debug port */ +#define PCIY_CRES 0x0b /* CompactPCI central resource control */ +#define PCIY_HOTPLUG 0x0c /* PCI Hot-Plug */ +#define PCIY_SUBVENDOR 0x0d /* PCI-PCI bridge subvendor ID */ +#define PCIY_AGP8X 0x0e /* AGP 8x */ +#define PCIY_SECDEV 0x0f /* Secure Device */ +#define PCIY_EXPRESS 0x10 /* PCI Express */ +#define PCIY_MSIX 0x11 /* MSI-X */ +#define PCIY_SATA 0x12 /* SATA */ +#define PCIY_PCIAF 0x13 /* PCI Advanced Features */ + +/* Extended Capability Register Fields */ + +#define PCIR_EXTCAP 0x100 +#define PCIM_EXTCAP_ID 0x0000ffff +#define PCIM_EXTCAP_VER 0x000f0000 +#define PCIM_EXTCAP_NEXTPTR 0xfff00000 +#define PCI_EXTCAP_ID(ecap) ((ecap) & PCIM_EXTCAP_ID) +#define PCI_EXTCAP_VER(ecap) (((ecap) & PCIM_EXTCAP_VER) >> 16) +#define PCI_EXTCAP_NEXTPTR(ecap) (((ecap) & PCIM_EXTCAP_NEXTPTR) >> 20) + +/* Extended Capability Identification Numbers */ + +#define PCIZ_AER 0x0001 /* Advanced Error Reporting */ +#define PCIZ_VC 0x0002 /* Virtual Channel if MFVC Ext Cap not set */ +#define PCIZ_SERNUM 0x0003 /* Device Serial Number */ +#define PCIZ_PWRBDGT 0x0004 /* Power Budgeting */ +#define PCIZ_RCLINK_DCL 0x0005 /* Root Complex Link Declaration */ +#define PCIZ_RCLINK_CTL 0x0006 /* Root Complex Internal Link Control */ +#define PCIZ_RCEC_ASSOC 0x0007 /* Root Complex Event Collector Association */ +#define PCIZ_MFVC 0x0008 /* Multi-Function Virtual Channel */ +#define PCIZ_VC2 0x0009 /* Virtual Channel if MFVC Ext Cap set */ +#define PCIZ_RCRB 0x000a /* RCRB Header */ +#define PCIZ_VENDOR 0x000b /* Vendor Unique */ +#define PCIZ_CAC 0x000c /* Configuration Access Correction -- obsolete */ +#define PCIZ_ACS 0x000d /* Access Control Services */ +#define PCIZ_ARI 0x000e /* Alternative Routing-ID Interpretation */ +#define PCIZ_ATS 0x000f /* Address Translation Services */ +#define PCIZ_SRIOV 0x0010 /* Single Root IO Virtualization */ +#define PCIZ_MRIOV 0x0011 /* Multiple Root IO Virtualization */ +#define PCIZ_MULTICAST 0x0012 /* Multicast */ +#define PCIZ_PAGE_REQ 0x0013 /* Page Request */ +#define PCIZ_AMD 0x0014 /* Reserved for AMD */ +#define PCIZ_RESIZE_BAR 0x0015 /* Resizable BAR */ +#define PCIZ_DPA 0x0016 /* Dynamic Power Allocation */ +#define PCIZ_TPH_REQ 0x0017 /* TPH Requester */ +#define PCIZ_LTR 0x0018 /* Latency Tolerance Reporting */ +#define PCIZ_SEC_PCIE 0x0019 /* Secondary PCI Express */ +#define PCIZ_PMUX 0x001a /* Protocol Multiplexing */ +#define PCIZ_PASID 0x001b /* Process Address Space ID */ +#define PCIZ_LN_REQ 0x001c /* LN Requester */ +#define PCIZ_DPC 0x001d /* Downstream Porto Containment */ +#define PCIZ_L1PM 0x001e /* L1 PM Substates */ + +/* config registers for header type 0 devices */ + +#define PCIR_BARS 0x10 +#define PCIR_BAR(x) (PCIR_BARS + (x) * 4) +#define PCIR_MAX_BAR_0 5 +#define PCI_RID2BAR(rid) (((rid) - PCIR_BARS) / 4) +#define PCI_BAR_IO(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_IO_SPACE) +#define PCI_BAR_MEM(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_MEM_SPACE) +#define PCIM_BAR_SPACE 0x00000001 +#define PCIM_BAR_MEM_SPACE 0 +#define PCIM_BAR_IO_SPACE 1 +#define PCIM_BAR_MEM_TYPE 0x00000006 +#define PCIM_BAR_MEM_32 0 +#define PCIM_BAR_MEM_1MB 2 /* Locate below 1MB in PCI <= 2.1 */ +#define PCIM_BAR_MEM_64 4 +#define PCIM_BAR_MEM_PREFETCH 0x00000008 +#define PCIM_BAR_MEM_BASE 0xfffffffffffffff0ULL +#define PCIM_BAR_IO_RESERVED 0x00000002 +#define PCIM_BAR_IO_BASE 0xfffffffc +#define PCIR_CIS 0x28 +#define PCIM_CIS_ASI_MASK 0x00000007 +#define PCIM_CIS_ASI_CONFIG 0 +#define PCIM_CIS_ASI_BAR0 1 +#define PCIM_CIS_ASI_BAR1 2 +#define PCIM_CIS_ASI_BAR2 3 +#define PCIM_CIS_ASI_BAR3 4 +#define PCIM_CIS_ASI_BAR4 5 +#define PCIM_CIS_ASI_BAR5 6 +#define PCIM_CIS_ASI_ROM 7 +#define PCIM_CIS_ADDR_MASK 0x0ffffff8 +#define PCIM_CIS_ROM_MASK 0xf0000000 +#define PCIM_CIS_CONFIG_MASK 0xff +#define PCIR_SUBVEND_0 0x2c +#define PCIR_SUBDEV_0 0x2e +#define PCIR_BIOS 0x30 +#define PCIM_BIOS_ENABLE 0x01 +#define PCIM_BIOS_ADDR_MASK 0xfffff800 +#define PCIR_CAP_PTR 0x34 +#define PCIR_INTLINE 0x3c +#define PCIR_INTPIN 0x3d +#define PCIR_MINGNT 0x3e +#define PCIR_MAXLAT 0x3f + +/* config registers for header type 1 (PCI-to-PCI bridge) devices */ + +#define PCIR_MAX_BAR_1 1 +#define PCIR_SECSTAT_1 0x1e + +#define PCIR_PRIBUS_1 0x18 +#define PCIR_SECBUS_1 0x19 +#define PCIR_SUBBUS_1 0x1a +#define PCIR_SECLAT_1 0x1b + +#define PCIR_IOBASEL_1 0x1c +#define PCIR_IOLIMITL_1 0x1d +#define PCIR_IOBASEH_1 0x30 +#define PCIR_IOLIMITH_1 0x32 +#define PCIM_BRIO_16 0x0 +#define PCIM_BRIO_32 0x1 +#define PCIM_BRIO_MASK 0xf + +#define PCIR_MEMBASE_1 0x20 +#define PCIR_MEMLIMIT_1 0x22 + +#define PCIR_PMBASEL_1 0x24 +#define PCIR_PMLIMITL_1 0x26 +#define PCIR_PMBASEH_1 0x28 +#define PCIR_PMLIMITH_1 0x2c +#define PCIM_BRPM_32 0x0 +#define PCIM_BRPM_64 0x1 +#define PCIM_BRPM_MASK 0xf + +#define PCIR_BIOS_1 0x38 +#define PCIR_BRIDGECTL_1 0x3e + +/* config registers for header type 2 (CardBus) devices */ + +#define PCIR_MAX_BAR_2 0 +#define PCIR_CAP_PTR_2 0x14 +#define PCIR_SECSTAT_2 0x16 + +#define PCIR_PRIBUS_2 0x18 +#define PCIR_SECBUS_2 0x19 +#define PCIR_SUBBUS_2 0x1a +#define PCIR_SECLAT_2 0x1b + +#define PCIR_MEMBASE0_2 0x1c +#define PCIR_MEMLIMIT0_2 0x20 +#define PCIR_MEMBASE1_2 0x24 +#define PCIR_MEMLIMIT1_2 0x28 +#define PCIR_IOBASE0_2 0x2c +#define PCIR_IOLIMIT0_2 0x30 +#define PCIR_IOBASE1_2 0x34 +#define PCIR_IOLIMIT1_2 0x38 + +#define PCIR_BRIDGECTL_2 0x3e + +#define PCIR_SUBVEND_2 0x40 +#define PCIR_SUBDEV_2 0x42 + +#define PCIR_PCCARDIF_2 0x44 + +/* PCI device class, subclass and programming interface definitions */ + +#define PCIC_OLD 0x00 +#define PCIS_OLD_NONVGA 0x00 +#define PCIS_OLD_VGA 0x01 + +#define PCIC_STORAGE 0x01 +#define PCIS_STORAGE_SCSI 0x00 +#define PCIS_STORAGE_IDE 0x01 +#define PCIP_STORAGE_IDE_MODEPRIM 0x01 +#define PCIP_STORAGE_IDE_PROGINDPRIM 0x02 +#define PCIP_STORAGE_IDE_MODESEC 0x04 +#define PCIP_STORAGE_IDE_PROGINDSEC 0x08 +#define PCIP_STORAGE_IDE_MASTERDEV 0x80 +#define PCIS_STORAGE_FLOPPY 0x02 +#define PCIS_STORAGE_IPI 0x03 +#define PCIS_STORAGE_RAID 0x04 +#define PCIS_STORAGE_ATA_ADMA 0x05 +#define PCIS_STORAGE_SATA 0x06 +#define PCIP_STORAGE_SATA_AHCI_1_0 0x01 +#define PCIS_STORAGE_SAS 0x07 +#define PCIS_STORAGE_NVM 0x08 +#define PCIP_STORAGE_NVM_NVMHCI_1_0 0x01 +#define PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0 0x02 +#define PCIS_STORAGE_OTHER 0x80 + +#define PCIC_NETWORK 0x02 +#define PCIS_NETWORK_ETHERNET 0x00 +#define PCIS_NETWORK_TOKENRING 0x01 +#define PCIS_NETWORK_FDDI 0x02 +#define PCIS_NETWORK_ATM 0x03 +#define PCIS_NETWORK_ISDN 0x04 +#define PCIS_NETWORK_WORLDFIP 0x05 +#define PCIS_NETWORK_PICMG 0x06 +#define PCIS_NETWORK_OTHER 0x80 + +#define PCIC_DISPLAY 0x03 +#define PCIS_DISPLAY_VGA 0x00 +#define PCIS_DISPLAY_XGA 0x01 +#define PCIS_DISPLAY_3D 0x02 +#define PCIS_DISPLAY_OTHER 0x80 + +#define PCIC_MULTIMEDIA 0x04 +#define PCIS_MULTIMEDIA_VIDEO 0x00 +#define PCIS_MULTIMEDIA_AUDIO 0x01 +#define PCIS_MULTIMEDIA_TELE 0x02 +#define PCIS_MULTIMEDIA_HDA 0x03 +#define PCIS_MULTIMEDIA_OTHER 0x80 + +#define PCIC_MEMORY 0x05 +#define PCIS_MEMORY_RAM 0x00 +#define PCIS_MEMORY_FLASH 0x01 +#define PCIS_MEMORY_OTHER 0x80 + +#define PCIC_BRIDGE 0x06 +#define PCIS_BRIDGE_HOST 0x00 +#define PCIS_BRIDGE_ISA 0x01 +#define PCIS_BRIDGE_EISA 0x02 +#define PCIS_BRIDGE_MCA 0x03 +#define PCIS_BRIDGE_PCI 0x04 +#define PCIP_BRIDGE_PCI_SUBTRACTIVE 0x01 +#define PCIS_BRIDGE_PCMCIA 0x05 +#define PCIS_BRIDGE_NUBUS 0x06 +#define PCIS_BRIDGE_CARDBUS 0x07 +#define PCIS_BRIDGE_RACEWAY 0x08 +#define PCIS_BRIDGE_PCI_TRANSPARENT 0x09 +#define PCIS_BRIDGE_INFINIBAND 0x0a +#define PCIS_BRIDGE_OTHER 0x80 + +#define PCIC_SIMPLECOMM 0x07 +#define PCIS_SIMPLECOMM_UART 0x00 +#define PCIP_SIMPLECOMM_UART_8250 0x00 +#define PCIP_SIMPLECOMM_UART_16450A 0x01 +#define PCIP_SIMPLECOMM_UART_16550A 0x02 +#define PCIP_SIMPLECOMM_UART_16650A 0x03 +#define PCIP_SIMPLECOMM_UART_16750A 0x04 +#define PCIP_SIMPLECOMM_UART_16850A 0x05 +#define PCIP_SIMPLECOMM_UART_16950A 0x06 +#define PCIS_SIMPLECOMM_PAR 0x01 +#define PCIS_SIMPLECOMM_MULSER 0x02 +#define PCIS_SIMPLECOMM_MODEM 0x03 +#define PCIS_SIMPLECOMM_GPIB 0x04 +#define PCIS_SIMPLECOMM_SMART_CARD 0x05 +#define PCIS_SIMPLECOMM_OTHER 0x80 + +#define PCIC_BASEPERIPH 0x08 +#define PCIS_BASEPERIPH_PIC 0x00 +#define PCIP_BASEPERIPH_PIC_8259A 0x00 +#define PCIP_BASEPERIPH_PIC_ISA 0x01 +#define PCIP_BASEPERIPH_PIC_EISA 0x02 +#define PCIP_BASEPERIPH_PIC_IO_APIC 0x10 +#define PCIP_BASEPERIPH_PIC_IOX_APIC 0x20 +#define PCIS_BASEPERIPH_DMA 0x01 +#define PCIS_BASEPERIPH_TIMER 0x02 +#define PCIS_BASEPERIPH_RTC 0x03 +#define PCIS_BASEPERIPH_PCIHOT 0x04 +#define PCIS_BASEPERIPH_SDHC 0x05 +#define PCIS_BASEPERIPH_IOMMU 0x06 +#define PCIS_BASEPERIPH_OTHER 0x80 + +#define PCIC_INPUTDEV 0x09 +#define PCIS_INPUTDEV_KEYBOARD 0x00 +#define PCIS_INPUTDEV_DIGITIZER 0x01 +#define PCIS_INPUTDEV_MOUSE 0x02 +#define PCIS_INPUTDEV_SCANNER 0x03 +#define PCIS_INPUTDEV_GAMEPORT 0x04 +#define PCIS_INPUTDEV_OTHER 0x80 + +#define PCIC_DOCKING 0x0a +#define PCIS_DOCKING_GENERIC 0x00 +#define PCIS_DOCKING_OTHER 0x80 + +#define PCIC_PROCESSOR 0x0b +#define PCIS_PROCESSOR_386 0x00 +#define PCIS_PROCESSOR_486 0x01 +#define PCIS_PROCESSOR_PENTIUM 0x02 +#define PCIS_PROCESSOR_ALPHA 0x10 +#define PCIS_PROCESSOR_POWERPC 0x20 +#define PCIS_PROCESSOR_MIPS 0x30 +#define PCIS_PROCESSOR_COPROC 0x40 + +#define PCIC_SERIALBUS 0x0c +#define PCIS_SERIALBUS_FW 0x00 +#define PCIS_SERIALBUS_ACCESS 0x01 +#define PCIS_SERIALBUS_SSA 0x02 +#define PCIS_SERIALBUS_USB 0x03 +#define PCIP_SERIALBUS_USB_UHCI 0x00 +#define PCIP_SERIALBUS_USB_OHCI 0x10 +#define PCIP_SERIALBUS_USB_EHCI 0x20 +#define PCIP_SERIALBUS_USB_XHCI 0x30 +#define PCIP_SERIALBUS_USB_DEVICE 0xfe +#define PCIS_SERIALBUS_FC 0x04 +#define PCIS_SERIALBUS_SMBUS 0x05 +#define PCIS_SERIALBUS_INFINIBAND 0x06 +#define PCIS_SERIALBUS_IPMI 0x07 +#define PCIP_SERIALBUS_IPMI_SMIC 0x00 +#define PCIP_SERIALBUS_IPMI_KCS 0x01 +#define PCIP_SERIALBUS_IPMI_BT 0x02 +#define PCIS_SERIALBUS_SERCOS 0x08 +#define PCIS_SERIALBUS_CANBUS 0x09 + +#define PCIC_WIRELESS 0x0d +#define PCIS_WIRELESS_IRDA 0x00 +#define PCIS_WIRELESS_IR 0x01 +#define PCIS_WIRELESS_RF 0x10 +#define PCIS_WIRELESS_BLUETOOTH 0x11 +#define PCIS_WIRELESS_BROADBAND 0x12 +#define PCIS_WIRELESS_80211A 0x20 +#define PCIS_WIRELESS_80211B 0x21 +#define PCIS_WIRELESS_OTHER 0x80 + +#define PCIC_INTELLIIO 0x0e +#define PCIS_INTELLIIO_I2O 0x00 + +#define PCIC_SATCOM 0x0f +#define PCIS_SATCOM_TV 0x01 +#define PCIS_SATCOM_AUDIO 0x02 +#define PCIS_SATCOM_VOICE 0x03 +#define PCIS_SATCOM_DATA 0x04 + +#define PCIC_CRYPTO 0x10 +#define PCIS_CRYPTO_NETCOMP 0x00 +#define PCIS_CRYPTO_ENTERTAIN 0x10 +#define PCIS_CRYPTO_OTHER 0x80 + +#define PCIC_DASP 0x11 +#define PCIS_DASP_DPIO 0x00 +#define PCIS_DASP_PERFCNTRS 0x01 +#define PCIS_DASP_COMM_SYNC 0x10 +#define PCIS_DASP_MGMT_CARD 0x20 +#define PCIS_DASP_OTHER 0x80 + +#define PCIC_OTHER 0xff + +/* Bridge Control Values. */ +#define PCIB_BCR_PERR_ENABLE 0x0001 +#define PCIB_BCR_SERR_ENABLE 0x0002 +#define PCIB_BCR_ISA_ENABLE 0x0004 +#define PCIB_BCR_VGA_ENABLE 0x0008 +#define PCIB_BCR_MASTER_ABORT_MODE 0x0020 +#define PCIB_BCR_SECBUS_RESET 0x0040 +#define PCIB_BCR_SECBUS_BACKTOBACK 0x0080 +#define PCIB_BCR_PRI_DISCARD_TIMEOUT 0x0100 +#define PCIB_BCR_SEC_DISCARD_TIMEOUT 0x0200 +#define PCIB_BCR_DISCARD_TIMER_STATUS 0x0400 +#define PCIB_BCR_DISCARD_TIMER_SERREN 0x0800 + +/* PCI power manangement */ +#define PCIR_POWER_CAP 0x2 +#define PCIM_PCAP_SPEC 0x0007 +#define PCIM_PCAP_PMEREQCLK 0x0008 +#define PCIM_PCAP_DEVSPECINIT 0x0020 +#define PCIM_PCAP_AUXPWR_0 0x0000 +#define PCIM_PCAP_AUXPWR_55 0x0040 +#define PCIM_PCAP_AUXPWR_100 0x0080 +#define PCIM_PCAP_AUXPWR_160 0x00c0 +#define PCIM_PCAP_AUXPWR_220 0x0100 +#define PCIM_PCAP_AUXPWR_270 0x0140 +#define PCIM_PCAP_AUXPWR_320 0x0180 +#define PCIM_PCAP_AUXPWR_375 0x01c0 +#define PCIM_PCAP_AUXPWRMASK 0x01c0 +#define PCIM_PCAP_D1SUPP 0x0200 +#define PCIM_PCAP_D2SUPP 0x0400 +#define PCIM_PCAP_D0PME 0x0800 +#define PCIM_PCAP_D1PME 0x1000 +#define PCIM_PCAP_D2PME 0x2000 +#define PCIM_PCAP_D3PME_HOT 0x4000 +#define PCIM_PCAP_D3PME_COLD 0x8000 + +#define PCIR_POWER_STATUS 0x4 +#define PCIM_PSTAT_D0 0x0000 +#define PCIM_PSTAT_D1 0x0001 +#define PCIM_PSTAT_D2 0x0002 +#define PCIM_PSTAT_D3 0x0003 +#define PCIM_PSTAT_DMASK 0x0003 +#define PCIM_PSTAT_NOSOFTRESET 0x0008 +#define PCIM_PSTAT_PMEENABLE 0x0100 +#define PCIM_PSTAT_D0POWER 0x0000 +#define PCIM_PSTAT_D1POWER 0x0200 +#define PCIM_PSTAT_D2POWER 0x0400 +#define PCIM_PSTAT_D3POWER 0x0600 +#define PCIM_PSTAT_D0HEAT 0x0800 +#define PCIM_PSTAT_D1HEAT 0x0a00 +#define PCIM_PSTAT_D2HEAT 0x0c00 +#define PCIM_PSTAT_D3HEAT 0x0e00 +#define PCIM_PSTAT_DATASELMASK 0x1e00 +#define PCIM_PSTAT_DATAUNKN 0x0000 +#define PCIM_PSTAT_DATADIV10 0x2000 +#define PCIM_PSTAT_DATADIV100 0x4000 +#define PCIM_PSTAT_DATADIV1000 0x6000 +#define PCIM_PSTAT_DATADIVMASK 0x6000 +#define PCIM_PSTAT_PME 0x8000 + +#define PCIR_POWER_BSE 0x6 +#define PCIM_PMCSR_BSE_D3B3 0x00 +#define PCIM_PMCSR_BSE_D3B2 0x40 +#define PCIM_PMCSR_BSE_BPCCE 0x80 + +#define PCIR_POWER_DATA 0x7 + +/* VPD capability registers */ +#define PCIR_VPD_ADDR 0x2 +#define PCIR_VPD_DATA 0x4 + +/* PCI Message Signalled Interrupts (MSI) */ +#define PCIR_MSI_CTRL 0x2 +#define PCIM_MSICTRL_VECTOR 0x0100 +#define PCIM_MSICTRL_64BIT 0x0080 +#define PCIM_MSICTRL_MME_MASK 0x0070 +#define PCIM_MSICTRL_MME_1 0x0000 +#define PCIM_MSICTRL_MME_2 0x0010 +#define PCIM_MSICTRL_MME_4 0x0020 +#define PCIM_MSICTRL_MME_8 0x0030 +#define PCIM_MSICTRL_MME_16 0x0040 +#define PCIM_MSICTRL_MME_32 0x0050 +#define PCIM_MSICTRL_MMC_MASK 0x000E +#define PCIM_MSICTRL_MMC_1 0x0000 +#define PCIM_MSICTRL_MMC_2 0x0002 +#define PCIM_MSICTRL_MMC_4 0x0004 +#define PCIM_MSICTRL_MMC_8 0x0006 +#define PCIM_MSICTRL_MMC_16 0x0008 +#define PCIM_MSICTRL_MMC_32 0x000A +#define PCIM_MSICTRL_MSI_ENABLE 0x0001 +#define PCIR_MSI_ADDR 0x4 +#define PCIR_MSI_ADDR_HIGH 0x8 +#define PCIR_MSI_DATA 0x8 +#define PCIR_MSI_DATA_64BIT 0xc +#define PCIR_MSI_MASK 0x10 +#define PCIR_MSI_PENDING 0x14 + +/* PCI-X definitions */ + +/* For header type 0 devices */ +#define PCIXR_COMMAND 0x2 +#define PCIXM_COMMAND_DPERR_E 0x0001 /* Data Parity Error Recovery */ +#define PCIXM_COMMAND_ERO 0x0002 /* Enable Relaxed Ordering */ +#define PCIXM_COMMAND_MAX_READ 0x000c /* Maximum Burst Read Count */ +#define PCIXM_COMMAND_MAX_READ_512 0x0000 +#define PCIXM_COMMAND_MAX_READ_1024 0x0004 +#define PCIXM_COMMAND_MAX_READ_2048 0x0008 +#define PCIXM_COMMAND_MAX_READ_4096 0x000c +#define PCIXM_COMMAND_MAX_SPLITS 0x0070 /* Maximum Split Transactions */ +#define PCIXM_COMMAND_MAX_SPLITS_1 0x0000 +#define PCIXM_COMMAND_MAX_SPLITS_2 0x0010 +#define PCIXM_COMMAND_MAX_SPLITS_3 0x0020 +#define PCIXM_COMMAND_MAX_SPLITS_4 0x0030 +#define PCIXM_COMMAND_MAX_SPLITS_8 0x0040 +#define PCIXM_COMMAND_MAX_SPLITS_12 0x0050 +#define PCIXM_COMMAND_MAX_SPLITS_16 0x0060 +#define PCIXM_COMMAND_MAX_SPLITS_32 0x0070 +#define PCIXM_COMMAND_VERSION 0x3000 +#define PCIXR_STATUS 0x4 +#define PCIXM_STATUS_DEVFN 0x000000FF +#define PCIXM_STATUS_BUS 0x0000FF00 +#define PCIXM_STATUS_64BIT 0x00010000 +#define PCIXM_STATUS_133CAP 0x00020000 +#define PCIXM_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_STATUS_COMPLEX_DEV 0x00100000 +#define PCIXM_STATUS_MAX_READ 0x00600000 +#define PCIXM_STATUS_MAX_READ_512 0x00000000 +#define PCIXM_STATUS_MAX_READ_1024 0x00200000 +#define PCIXM_STATUS_MAX_READ_2048 0x00400000 +#define PCIXM_STATUS_MAX_READ_4096 0x00600000 +#define PCIXM_STATUS_MAX_SPLITS 0x03800000 +#define PCIXM_STATUS_MAX_SPLITS_1 0x00000000 +#define PCIXM_STATUS_MAX_SPLITS_2 0x00800000 +#define PCIXM_STATUS_MAX_SPLITS_3 0x01000000 +#define PCIXM_STATUS_MAX_SPLITS_4 0x01800000 +#define PCIXM_STATUS_MAX_SPLITS_8 0x02000000 +#define PCIXM_STATUS_MAX_SPLITS_12 0x02800000 +#define PCIXM_STATUS_MAX_SPLITS_16 0x03000000 +#define PCIXM_STATUS_MAX_SPLITS_32 0x03800000 +#define PCIXM_STATUS_MAX_CUM_READ 0x1C000000 +#define PCIXM_STATUS_RCVD_SC_ERR 0x20000000 +#define PCIXM_STATUS_266CAP 0x40000000 +#define PCIXM_STATUS_533CAP 0x80000000 + +/* For header type 1 devices (PCI-X bridges) */ +#define PCIXR_SEC_STATUS 0x2 +#define PCIXM_SEC_STATUS_64BIT 0x0001 +#define PCIXM_SEC_STATUS_133CAP 0x0002 +#define PCIXM_SEC_STATUS_SC_DISC 0x0004 +#define PCIXM_SEC_STATUS_UNEXP_SC 0x0008 +#define PCIXM_SEC_STATUS_SC_OVERRUN 0x0010 +#define PCIXM_SEC_STATUS_SR_DELAYED 0x0020 +#define PCIXM_SEC_STATUS_BUS_MODE 0x03c0 +#define PCIXM_SEC_STATUS_VERSION 0x3000 +#define PCIXM_SEC_STATUS_266CAP 0x4000 +#define PCIXM_SEC_STATUS_533CAP 0x8000 +#define PCIXR_BRIDGE_STATUS 0x4 +#define PCIXM_BRIDGE_STATUS_DEVFN 0x000000FF +#define PCIXM_BRIDGE_STATUS_BUS 0x0000FF00 +#define PCIXM_BRIDGE_STATUS_64BIT 0x00010000 +#define PCIXM_BRIDGE_STATUS_133CAP 0x00020000 +#define PCIXM_BRIDGE_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_BRIDGE_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_BRIDGE_STATUS_SC_OVERRUN 0x00100000 +#define PCIXM_BRIDGE_STATUS_SR_DELAYED 0x00200000 +#define PCIXM_BRIDGE_STATUS_DEVID_MSGCAP 0x20000000 +#define PCIXM_BRIDGE_STATUS_266CAP 0x40000000 +#define PCIXM_BRIDGE_STATUS_533CAP 0x80000000 + +/* HT (HyperTransport) Capability definitions */ +#define PCIR_HT_COMMAND 0x2 +#define PCIM_HTCMD_CAP_MASK 0xf800 /* Capability type. */ +#define PCIM_HTCAP_SLAVE 0x0000 /* 000xx */ +#define PCIM_HTCAP_HOST 0x2000 /* 001xx */ +#define PCIM_HTCAP_SWITCH 0x4000 /* 01000 */ +#define PCIM_HTCAP_INTERRUPT 0x8000 /* 10000 */ +#define PCIM_HTCAP_REVISION_ID 0x8800 /* 10001 */ +#define PCIM_HTCAP_UNITID_CLUMPING 0x9000 /* 10010 */ +#define PCIM_HTCAP_EXT_CONFIG_SPACE 0x9800 /* 10011 */ +#define PCIM_HTCAP_ADDRESS_MAPPING 0xa000 /* 10100 */ +#define PCIM_HTCAP_MSI_MAPPING 0xa800 /* 10101 */ +#define PCIM_HTCAP_DIRECT_ROUTE 0xb000 /* 10110 */ +#define PCIM_HTCAP_VCSET 0xb800 /* 10111 */ +#define PCIM_HTCAP_RETRY_MODE 0xc000 /* 11000 */ +#define PCIM_HTCAP_X86_ENCODING 0xc800 /* 11001 */ +#define PCIM_HTCAP_GEN3 0xd000 /* 11010 */ +#define PCIM_HTCAP_FLE 0xd800 /* 11011 */ +#define PCIM_HTCAP_PM 0xe000 /* 11100 */ +#define PCIM_HTCAP_HIGH_NODE_COUNT 0xe800 /* 11101 */ + +/* HT MSI Mapping Capability definitions. */ +#define PCIM_HTCMD_MSI_ENABLE 0x0001 +#define PCIM_HTCMD_MSI_FIXED 0x0002 +#define PCIR_HTMSI_ADDRESS_LO 0x4 +#define PCIR_HTMSI_ADDRESS_HI 0x8 + +/* PCI Vendor capability definitions */ +#define PCIR_VENDOR_LENGTH 0x2 +#define PCIR_VENDOR_DATA 0x3 + +/* PCI EHCI Debug Port definitions */ +#define PCIR_DEBUG_PORT 0x2 +#define PCIM_DEBUG_PORT_OFFSET 0x1FFF +#define PCIM_DEBUG_PORT_BAR 0xe000 + +/* PCI-PCI Bridge Subvendor definitions */ +#define PCIR_SUBVENDCAP_ID 0x4 + +/* PCI Express definitions */ +#define PCIER_FLAGS 0x2 +#define PCIEM_FLAGS_VERSION 0x000F +#define PCIEM_FLAGS_TYPE 0x00F0 +#define PCIEM_TYPE_ENDPOINT 0x0000 +#define PCIEM_TYPE_LEGACY_ENDPOINT 0x0010 +#define PCIEM_TYPE_ROOT_PORT 0x0040 +#define PCIEM_TYPE_UPSTREAM_PORT 0x0050 +#define PCIEM_TYPE_DOWNSTREAM_PORT 0x0060 +#define PCIEM_TYPE_PCI_BRIDGE 0x0070 +#define PCIEM_TYPE_PCIE_BRIDGE 0x0080 +#define PCIEM_TYPE_ROOT_INT_EP 0x0090 +#define PCIEM_TYPE_ROOT_EC 0x00a0 +#define PCIEM_FLAGS_SLOT 0x0100 +#define PCIEM_FLAGS_IRQ 0x3e00 +#define PCIER_DEVICE_CAP 0x4 +#define PCIEM_CAP_MAX_PAYLOAD 0x00000007 +#define PCIEM_CAP_PHANTHOM_FUNCS 0x00000018 +#define PCIEM_CAP_EXT_TAG_FIELD 0x00000020 +#define PCIEM_CAP_L0S_LATENCY 0x000001c0 +#define PCIEM_CAP_L1_LATENCY 0x00000e00 +#define PCIEM_CAP_ROLE_ERR_RPT 0x00008000 +#define PCIEM_CAP_SLOT_PWR_LIM_VAL 0x03fc0000 +#define PCIEM_CAP_SLOT_PWR_LIM_SCALE 0x0c000000 +#define PCIEM_CAP_FLR 0x10000000 +#define PCIER_DEVICE_CTL 0x8 +#define PCIEM_CTL_COR_ENABLE 0x0001 +#define PCIEM_CTL_NFER_ENABLE 0x0002 +#define PCIEM_CTL_FER_ENABLE 0x0004 +#define PCIEM_CTL_URR_ENABLE 0x0008 +#define PCIEM_CTL_RELAXED_ORD_ENABLE 0x0010 +#define PCIEM_CTL_MAX_PAYLOAD 0x00e0 +#define PCIEM_CTL_EXT_TAG_FIELD 0x0100 +#define PCIEM_CTL_PHANTHOM_FUNCS 0x0200 +#define PCIEM_CTL_AUX_POWER_PM 0x0400 +#define PCIEM_CTL_NOSNOOP_ENABLE 0x0800 +#define PCIEM_CTL_MAX_READ_REQUEST 0x7000 +#define PCIEM_CTL_BRDG_CFG_RETRY 0x8000 /* PCI-E - PCI/PCI-X bridges */ +#define PCIEM_CTL_INITIATE_FLR 0x8000 /* FLR capable endpoints */ +#define PCIER_DEVICE_STA 0xa +#define PCIEM_STA_CORRECTABLE_ERROR 0x0001 +#define PCIEM_STA_NON_FATAL_ERROR 0x0002 +#define PCIEM_STA_FATAL_ERROR 0x0004 +#define PCIEM_STA_UNSUPPORTED_REQ 0x0008 +#define PCIEM_STA_AUX_POWER 0x0010 +#define PCIEM_STA_TRANSACTION_PND 0x0020 +#define PCIER_LINK_CAP 0xc +#define PCIEM_LINK_CAP_MAX_SPEED 0x0000000f +#define PCIEM_LINK_CAP_MAX_WIDTH 0x000003f0 +#define PCIEM_LINK_CAP_ASPM 0x00000c00 +#define PCIEM_LINK_CAP_L0S_EXIT 0x00007000 +#define PCIEM_LINK_CAP_L1_EXIT 0x00038000 +#define PCIEM_LINK_CAP_CLOCK_PM 0x00040000 +#define PCIEM_LINK_CAP_SURPRISE_DOWN 0x00080000 +#define PCIEM_LINK_CAP_DL_ACTIVE 0x00100000 +#define PCIEM_LINK_CAP_LINK_BW_NOTIFY 0x00200000 +#define PCIEM_LINK_CAP_ASPM_COMPLIANCE 0x00400000 +#define PCIEM_LINK_CAP_PORT 0xff000000 +#define PCIER_LINK_CTL 0x10 +#define PCIEM_LINK_CTL_ASPMC_DIS 0x0000 +#define PCIEM_LINK_CTL_ASPMC_L0S 0x0001 +#define PCIEM_LINK_CTL_ASPMC_L1 0x0002 +#define PCIEM_LINK_CTL_ASPMC 0x0003 +#define PCIEM_LINK_CTL_RCB 0x0008 +#define PCIEM_LINK_CTL_LINK_DIS 0x0010 +#define PCIEM_LINK_CTL_RETRAIN_LINK 0x0020 +#define PCIEM_LINK_CTL_COMMON_CLOCK 0x0040 +#define PCIEM_LINK_CTL_EXTENDED_SYNC 0x0080 +#define PCIEM_LINK_CTL_ECPM 0x0100 +#define PCIEM_LINK_CTL_HAWD 0x0200 +#define PCIEM_LINK_CTL_LBMIE 0x0400 +#define PCIEM_LINK_CTL_LABIE 0x0800 +#define PCIER_LINK_STA 0x12 +#define PCIEM_LINK_STA_SPEED 0x000f +#define PCIEM_LINK_STA_WIDTH 0x03f0 +#define PCIEM_LINK_STA_TRAINING_ERROR 0x0400 +#define PCIEM_LINK_STA_TRAINING 0x0800 +#define PCIEM_LINK_STA_SLOT_CLOCK 0x1000 +#define PCIEM_LINK_STA_DL_ACTIVE 0x2000 +#define PCIEM_LINK_STA_LINK_BW_MGMT 0x4000 +#define PCIEM_LINK_STA_LINK_AUTO_BW 0x8000 +#define PCIER_SLOT_CAP 0x14 +#define PCIEM_SLOT_CAP_APB 0x00000001 +#define PCIEM_SLOT_CAP_PCP 0x00000002 +#define PCIEM_SLOT_CAP_MRLSP 0x00000004 +#define PCIEM_SLOT_CAP_AIP 0x00000008 +#define PCIEM_SLOT_CAP_PIP 0x00000010 +#define PCIEM_SLOT_CAP_HPS 0x00000020 +#define PCIEM_SLOT_CAP_HPC 0x00000040 +#define PCIEM_SLOT_CAP_SPLV 0x00007f80 +#define PCIEM_SLOT_CAP_SPLS 0x00018000 +#define PCIEM_SLOT_CAP_EIP 0x00020000 +#define PCIEM_SLOT_CAP_NCCS 0x00040000 +#define PCIEM_SLOT_CAP_PSN 0xfff80000 +#define PCIER_SLOT_CTL 0x18 +#define PCIEM_SLOT_CTL_ABPE 0x0001 +#define PCIEM_SLOT_CTL_PFDE 0x0002 +#define PCIEM_SLOT_CTL_MRLSCE 0x0004 +#define PCIEM_SLOT_CTL_PDCE 0x0008 +#define PCIEM_SLOT_CTL_CCIE 0x0010 +#define PCIEM_SLOT_CTL_HPIE 0x0020 +#define PCIEM_SLOT_CTL_AIC 0x00c0 +#define PCIEM_SLOT_CTL_PIC 0x0300 +#define PCIEM_SLOT_CTL_PCC 0x0400 +#define PCIEM_SLOT_CTL_EIC 0x0800 +#define PCIEM_SLOT_CTL_DLLSCE 0x1000 +#define PCIER_SLOT_STA 0x1a +#define PCIEM_SLOT_STA_ABP 0x0001 +#define PCIEM_SLOT_STA_PFD 0x0002 +#define PCIEM_SLOT_STA_MRLSC 0x0004 +#define PCIEM_SLOT_STA_PDC 0x0008 +#define PCIEM_SLOT_STA_CC 0x0010 +#define PCIEM_SLOT_STA_MRLSS 0x0020 +#define PCIEM_SLOT_STA_PDS 0x0040 +#define PCIEM_SLOT_STA_EIS 0x0080 +#define PCIEM_SLOT_STA_DLLSC 0x0100 +#define PCIER_ROOT_CTL 0x1c +#define PCIEM_ROOT_CTL_SERR_CORR 0x0001 +#define PCIEM_ROOT_CTL_SERR_NONFATAL 0x0002 +#define PCIEM_ROOT_CTL_SERR_FATAL 0x0004 +#define PCIEM_ROOT_CTL_PME 0x0008 +#define PCIEM_ROOT_CTL_CRS_VIS 0x0010 +#define PCIER_ROOT_CAP 0x1e +#define PCIEM_ROOT_CAP_CRS_VIS 0x0001 +#define PCIER_ROOT_STA 0x20 +#define PCIEM_ROOT_STA_PME_REQID_MASK 0x0000ffff +#define PCIEM_ROOT_STA_PME_STATUS 0x00010000 +#define PCIEM_ROOT_STA_PME_PEND 0x00020000 +#define PCIER_DEVICE_CAP2 0x24 +#define PCIEM_CAP2_ARI 0x20 +#define PCIER_DEVICE_CTL2 0x28 +#define PCIEM_CTL2_COMP_TIMEOUT_VAL 0x000f +#define PCIEM_CTL2_COMP_TIMEOUT_DIS 0x0010 +#define PCIEM_CTL2_ARI 0x0020 +#define PCIEM_CTL2_ATOMIC_REQ_ENABLE 0x0040 +#define PCIEM_CTL2_ATOMIC_EGR_BLOCK 0x0080 +#define PCIEM_CTL2_ID_ORDERED_REQ_EN 0x0100 +#define PCIEM_CTL2_ID_ORDERED_CMP_EN 0x0200 +#define PCIEM_CTL2_LTR_ENABLE 0x0400 +#define PCIEM_CTL2_OBFF 0x6000 +#define PCIEM_OBFF_DISABLE 0x0000 +#define PCIEM_OBFF_MSGA_ENABLE 0x2000 +#define PCIEM_OBFF_MSGB_ENABLE 0x4000 +#define PCIEM_OBFF_WAKE_ENABLE 0x6000 +#define PCIEM_CTL2_END2END_TLP 0x8000 +#define PCIER_DEVICE_STA2 0x2a +#define PCIER_LINK_CAP2 0x2c +#define PCIER_LINK_CTL2 0x30 +#define PCIER_LINK_STA2 0x32 +#define PCIER_SLOT_CAP2 0x34 +#define PCIER_SLOT_CTL2 0x38 +#define PCIER_SLOT_STA2 0x3a + +/* MSI-X definitions */ +#define PCIR_MSIX_CTRL 0x2 +#define PCIM_MSIXCTRL_MSIX_ENABLE 0x8000 +#define PCIM_MSIXCTRL_FUNCTION_MASK 0x4000 +#define PCIM_MSIXCTRL_TABLE_SIZE 0x07FF +#define PCIR_MSIX_TABLE 0x4 +#define PCIR_MSIX_PBA 0x8 +#define PCIM_MSIX_BIR_MASK 0x7 +#define PCIM_MSIX_BIR_BAR_10 0 +#define PCIM_MSIX_BIR_BAR_14 1 +#define PCIM_MSIX_BIR_BAR_18 2 +#define PCIM_MSIX_BIR_BAR_1C 3 +#define PCIM_MSIX_BIR_BAR_20 4 +#define PCIM_MSIX_BIR_BAR_24 5 +#define PCIM_MSIX_VCTRL_MASK 0x1 + +/* PCI Advanced Features definitions */ +#define PCIR_PCIAF_CAP 0x3 +#define PCIM_PCIAFCAP_TP 0x01 +#define PCIM_PCIAFCAP_FLR 0x02 +#define PCIR_PCIAF_CTRL 0x4 +#define PCIR_PCIAFCTRL_FLR 0x01 +#define PCIR_PCIAF_STATUS 0x5 +#define PCIR_PCIAFSTATUS_TP 0x01 + +/* Advanced Error Reporting */ +#define PCIR_AER_UC_STATUS 0x04 +#define PCIM_AER_UC_TRAINING_ERROR 0x00000001 +#define PCIM_AER_UC_DL_PROTOCOL_ERROR 0x00000010 +#define PCIM_AER_UC_SURPRISE_LINK_DOWN 0x00000020 +#define PCIM_AER_UC_POISONED_TLP 0x00001000 +#define PCIM_AER_UC_FC_PROTOCOL_ERROR 0x00002000 +#define PCIM_AER_UC_COMPLETION_TIMEOUT 0x00004000 +#define PCIM_AER_UC_COMPLETER_ABORT 0x00008000 +#define PCIM_AER_UC_UNEXPECTED_COMPLETION 0x00010000 +#define PCIM_AER_UC_RECEIVER_OVERFLOW 0x00020000 +#define PCIM_AER_UC_MALFORMED_TLP 0x00040000 +#define PCIM_AER_UC_ECRC_ERROR 0x00080000 +#define PCIM_AER_UC_UNSUPPORTED_REQUEST 0x00100000 +#define PCIM_AER_UC_ACS_VIOLATION 0x00200000 +#define PCIM_AER_UC_INTERNAL_ERROR 0x00400000 +#define PCIM_AER_UC_MC_BLOCKED_TLP 0x00800000 +#define PCIM_AER_UC_ATOMIC_EGRESS_BLK 0x01000000 +#define PCIM_AER_UC_TLP_PREFIX_BLOCKED 0x02000000 +#define PCIR_AER_UC_MASK 0x08 /* Shares bits with UC_STATUS */ +#define PCIR_AER_UC_SEVERITY 0x0c /* Shares bits with UC_STATUS */ +#define PCIR_AER_COR_STATUS 0x10 +#define PCIM_AER_COR_RECEIVER_ERROR 0x00000001 +#define PCIM_AER_COR_BAD_TLP 0x00000040 +#define PCIM_AER_COR_BAD_DLLP 0x00000080 +#define PCIM_AER_COR_REPLAY_ROLLOVER 0x00000100 +#define PCIM_AER_COR_REPLAY_TIMEOUT 0x00001000 +#define PCIM_AER_COR_ADVISORY_NF_ERROR 0x00002000 +#define PCIM_AER_COR_INTERNAL_ERROR 0x00004000 +#define PCIM_AER_COR_HEADER_LOG_OVFLOW 0x00008000 +#define PCIR_AER_COR_MASK 0x14 /* Shares bits with COR_STATUS */ +#define PCIR_AER_CAP_CONTROL 0x18 +#define PCIM_AER_FIRST_ERROR_PTR 0x0000001f +#define PCIM_AER_ECRC_GEN_CAPABLE 0x00000020 +#define PCIM_AER_ECRC_GEN_ENABLE 0x00000040 +#define PCIM_AER_ECRC_CHECK_CAPABLE 0x00000080 +#define PCIM_AER_ECRC_CHECK_ENABLE 0x00000100 +#define PCIM_AER_MULT_HDR_CAPABLE 0x00000200 +#define PCIM_AER_MULT_HDR_ENABLE 0x00000400 +#define PCIM_AER_TLP_PREFIX_LOG_PRESENT 0x00000800 +#define PCIR_AER_HEADER_LOG 0x1c +#define PCIR_AER_ROOTERR_CMD 0x2c /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ENABLE 0x00000001 +#define PCIM_AER_ROOTERR_NF_ENABLE 0x00000002 +#define PCIM_AER_ROOTERR_F_ENABLE 0x00000004 +#define PCIR_AER_ROOTERR_STATUS 0x30 /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ERR 0x00000001 +#define PCIM_AER_ROOTERR_MULTI_COR_ERR 0x00000002 +#define PCIM_AER_ROOTERR_UC_ERR 0x00000004 +#define PCIM_AER_ROOTERR_MULTI_UC_ERR 0x00000008 +#define PCIM_AER_ROOTERR_FIRST_UC_FATAL 0x00000010 +#define PCIM_AER_ROOTERR_NF_ERR 0x00000020 +#define PCIM_AER_ROOTERR_F_ERR 0x00000040 +#define PCIM_AER_ROOTERR_INT_MESSAGE 0xf8000000 +#define PCIR_AER_COR_SOURCE_ID 0x34 /* Only for root complex ports */ +#define PCIR_AER_ERR_SOURCE_ID 0x36 /* Only for root complex ports */ +#define PCIR_AER_TLP_PREFIX_LOG 0x38 /* Only for TLP prefix functions */ + +/* Virtual Channel definitions */ +#define PCIR_VC_CAP1 0x04 +#define PCIM_VC_CAP1_EXT_COUNT 0x00000007 +#define PCIM_VC_CAP1_LOWPRI_EXT_COUNT 0x00000070 +#define PCIR_VC_CAP2 0x08 +#define PCIR_VC_CONTROL 0x0C +#define PCIR_VC_STATUS 0x0E +#define PCIR_VC_RESOURCE_CAP(n) (0x10 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_CTL(n) (0x14 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_STA(n) (0x18 + (n) * 0x0C) + +/* Serial Number definitions */ +#define PCIR_SERIAL_LOW 0x04 +#define PCIR_SERIAL_HIGH 0x08 + diff --git a/usr/contrib/freebsd/isa/isareg.h b/usr/contrib/freebsd/isa/isareg.h new file mode 100644 index 0000000000..e83e34674f --- /dev/null +++ b/usr/contrib/freebsd/isa/isareg.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)isa.h 5.7 (Berkeley) 5/9/91 + * $FreeBSD: head/sys/isa/isareg.h 263379 2014-03-19 21:03:04Z imp $ + */ + +#ifdef PC98 +#error isareg.h is included from PC-9801 source +#endif + +#ifndef _ISA_ISA_H_ +#define _ISA_ISA_H_ + +/* + * ISA Bus conventions + */ + +/* + * Input / Output Port Assignments + */ +#ifndef IO_ISABEGIN +#define IO_ISABEGIN 0x000 /* 0x000 - Beginning of I/O Registers */ +#define IO_ICU1 0x020 /* 8259A Interrupt Controller #1 */ +#define IO_KBD 0x060 /* 8042 Keyboard */ +#define IO_RTC 0x070 /* RTC */ +#define IO_ICU2 0x0A0 /* 8259A Interrupt Controller #2 */ + +#define IO_MDA 0x3B0 /* Monochome Adapter */ +#define IO_VGA 0x3C0 /* E/VGA Ports */ +#define IO_CGA 0x3D0 /* CGA Ports */ + +#endif /* !IO_ISABEGIN */ + +/* + * Input / Output Port Sizes + */ +#define IO_CGASIZE 12 /* CGA controllers */ +#define IO_MDASIZE 12 /* Monochrome display controllers */ +#define IO_VGASIZE 16 /* VGA controllers */ + +#endif /* !_ISA_ISA_H_ */ diff --git a/usr/contrib/freebsd/lib/libutil/expand_number.c b/usr/contrib/freebsd/lib/libutil/expand_number.c new file mode 100644 index 0000000000..f3b4da89f9 --- /dev/null +++ b/usr/contrib/freebsd/lib/libutil/expand_number.c @@ -0,0 +1,93 @@ +/*- + * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org> + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 255069 2013-08-30 11:21:52Z pluknet $"); + +#include <sys/types.h> +#include <ctype.h> +#include <errno.h> +#include <inttypes.h> +#include <libutil.h> +#include <stdint.h> + +int +expand_number(const char *buf, uint64_t *num) +{ + char *endptr; + uintmax_t umaxval; + uint64_t number; + unsigned shift; + int serrno; + + serrno = errno; + errno = 0; + umaxval = strtoumax(buf, &endptr, 0); + if (umaxval > UINT64_MAX) + errno = ERANGE; + if (errno != 0) + return (-1); + errno = serrno; + number = umaxval; + + switch (tolower((unsigned char)*endptr)) { + case 'e': + shift = 60; + break; + case 'p': + shift = 50; + break; + case 't': + shift = 40; + break; + case 'g': + shift = 30; + break; + case 'm': + shift = 20; + break; + case 'k': + shift = 10; + break; + case 'b': + case '\0': /* No unit. */ + *num = number; + return (0); + default: + /* Unrecognized unit. */ + errno = EINVAL; + return (-1); + } + + if ((number << shift) >> shift != number) { + /* Overflow */ + errno = ERANGE; + return (-1); + } + *num = number << shift; + return (0); +} diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h new file mode 100644 index 0000000000..705460355f --- /dev/null +++ b/usr/contrib/freebsd/sys/ata.h @@ -0,0 +1,635 @@ +/*- + * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $ + */ + +#ifndef _SYS_ATA_H_ +#define _SYS_ATA_H_ + +#include <sys/ioccom.h> + +/* ATA/ATAPI device parameters */ +struct ata_params { +/*000*/ u_int16_t config; /* configuration info */ +#define ATA_PROTO_MASK 0x8003 +#define ATA_PROTO_ATAPI 0x8000 +#define ATA_PROTO_ATAPI_12 0x8000 +#define ATA_PROTO_ATAPI_16 0x8001 +#define ATA_PROTO_CFA 0x848a +#define ATA_ATAPI_TYPE_MASK 0x1f00 +#define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */ +#define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */ +#define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */ +#define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */ +#define ATA_DRQ_MASK 0x0060 +#define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */ +#define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */ +#define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */ +#define ATA_RESP_INCOMPLETE 0x0004 + +/*001*/ u_int16_t cylinders; /* # of cylinders */ +/*002*/ u_int16_t specconf; /* specific configuration */ +/*003*/ u_int16_t heads; /* # heads */ + u_int16_t obsolete4; + u_int16_t obsolete5; +/*006*/ u_int16_t sectors; /* # sectors/track */ +/*007*/ u_int16_t vendor7[3]; +/*010*/ u_int8_t serial[20]; /* serial number */ +/*020*/ u_int16_t retired20; + u_int16_t retired21; + u_int16_t obsolete22; +/*023*/ u_int8_t revision[8]; /* firmware revision */ +/*027*/ u_int8_t model[40]; /* model name */ +/*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ +/*048*/ u_int16_t usedmovsd; /* double word read/write? */ +/*049*/ u_int16_t capabilities1; +#define ATA_SUPPORT_DMA 0x0100 +#define ATA_SUPPORT_LBA 0x0200 +#define ATA_SUPPORT_IORDY 0x0400 +#define ATA_SUPPORT_IORDYDIS 0x0800 +#define ATA_SUPPORT_OVERLAP 0x4000 + +/*050*/ u_int16_t capabilities2; +/*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */ +#define ATA_RETIRED_PIO_MASK 0x0300 + +/*052*/ u_int16_t retired_dmamode; /* DMA modes */ +#define ATA_RETIRED_DMA_MASK 0x0003 + +/*053*/ u_int16_t atavalid; /* fields valid */ +#define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */ +#define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */ +#define ATA_FLAG_88 0x0004 /* word 88 valid */ + +/*054*/ u_int16_t current_cylinders; +/*055*/ u_int16_t current_heads; +/*056*/ u_int16_t current_sectors; +/*057*/ u_int16_t current_size_1; +/*058*/ u_int16_t current_size_2; +/*059*/ u_int16_t multi; +#define ATA_MULTI_VALID 0x0100 + +/*060*/ u_int16_t lba_size_1; + u_int16_t lba_size_2; + u_int16_t obsolete62; +/*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */ +/*064*/ u_int16_t apiomodes; /* advanced PIO modes */ + +/*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */ +/*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */ +/*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */ +/*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */ +/*069*/ u_int16_t support3; +#define ATA_SUPPORT_RZAT 0x0020 +#define ATA_SUPPORT_DRAT 0x4000 + u_int16_t reserved70; +/*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ +/*072*/ u_int16_t rlsservice; /* rel time (us) for service */ + u_int16_t reserved73; + u_int16_t reserved74; +/*075*/ u_int16_t queue; +#define ATA_QUEUE_LEN(x) ((x) & 0x001f) + +/*76*/ u_int16_t satacapabilities; +#define ATA_SATA_GEN1 0x0002 +#define ATA_SATA_GEN2 0x0004 +#define ATA_SATA_GEN3 0x0008 +#define ATA_SUPPORT_NCQ 0x0100 +#define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 +#define ATA_SUPPORT_PHYEVENTCNT 0x0400 +#define ATA_SUPPORT_NCQ_UNLOAD 0x0800 +#define ATA_SUPPORT_NCQ_PRIO 0x1000 +#define ATA_SUPPORT_HAPST 0x2000 +#define ATA_SUPPORT_DAPST 0x4000 +#define ATA_SUPPORT_READLOGDMAEXT 0x8000 + +/*77*/ u_int16_t satacapabilities2; +#define ATA_SATA_CURR_GEN_MASK 0x0006 +#define ATA_SUPPORT_NCQ_STREAM 0x0010 +#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 +/*78*/ u_int16_t satasupport; +#define ATA_SUPPORT_NONZERO 0x0002 +#define ATA_SUPPORT_AUTOACTIVATE 0x0004 +#define ATA_SUPPORT_IFPWRMNGT 0x0008 +#define ATA_SUPPORT_INORDERDATA 0x0010 +#define ATA_SUPPORT_ASYNCNOTIF 0x0020 +#define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +/*79*/ u_int16_t sataenabled; +#define ATA_ENABLED_DAPST 0x0080 + +/*080*/ u_int16_t version_major; +/*081*/ u_int16_t version_minor; + + struct { +/*082/085*/ u_int16_t command1; +#define ATA_SUPPORT_SMART 0x0001 +#define ATA_SUPPORT_SECURITY 0x0002 +#define ATA_SUPPORT_REMOVABLE 0x0004 +#define ATA_SUPPORT_POWERMGT 0x0008 +#define ATA_SUPPORT_PACKET 0x0010 +#define ATA_SUPPORT_WRITECACHE 0x0020 +#define ATA_SUPPORT_LOOKAHEAD 0x0040 +#define ATA_SUPPORT_RELEASEIRQ 0x0080 +#define ATA_SUPPORT_SERVICEIRQ 0x0100 +#define ATA_SUPPORT_RESET 0x0200 +#define ATA_SUPPORT_PROTECTED 0x0400 +#define ATA_SUPPORT_WRITEBUFFER 0x1000 +#define ATA_SUPPORT_READBUFFER 0x2000 +#define ATA_SUPPORT_NOP 0x4000 + +/*083/086*/ u_int16_t command2; +#define ATA_SUPPORT_MICROCODE 0x0001 +#define ATA_SUPPORT_QUEUED 0x0002 +#define ATA_SUPPORT_CFA 0x0004 +#define ATA_SUPPORT_APM 0x0008 +#define ATA_SUPPORT_NOTIFY 0x0010 +#define ATA_SUPPORT_STANDBY 0x0020 +#define ATA_SUPPORT_SPINUP 0x0040 +#define ATA_SUPPORT_MAXSECURITY 0x0100 +#define ATA_SUPPORT_AUTOACOUSTIC 0x0200 +#define ATA_SUPPORT_ADDRESS48 0x0400 +#define ATA_SUPPORT_OVERLAY 0x0800 +#define ATA_SUPPORT_FLUSHCACHE 0x1000 +#define ATA_SUPPORT_FLUSHCACHE48 0x2000 + +/*084/087*/ u_int16_t extension; +#define ATA_SUPPORT_SMARTLOG 0x0001 +#define ATA_SUPPORT_SMARTTEST 0x0002 +#define ATA_SUPPORT_MEDIASN 0x0004 +#define ATA_SUPPORT_MEDIAPASS 0x0008 +#define ATA_SUPPORT_STREAMING 0x0010 +#define ATA_SUPPORT_GENLOG 0x0020 +#define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 +#define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 +#define ATA_SUPPORT_64BITWWN 0x0100 +#define ATA_SUPPORT_UNLOAD 0x2000 + } __packed support, enabled; + +/*088*/ u_int16_t udmamodes; /* UltraDMA modes */ +/*089*/ u_int16_t erase_time; /* time req'd in 2min units */ +/*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ +/*091*/ u_int16_t apm_value; +/*092*/ u_int16_t master_passwd_revision; /* password revision code */ +/*093*/ u_int16_t hwres; +#define ATA_CABLE_ID 0x2000 + +/*094*/ u_int16_t acoustic; +#define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) +#define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) + +/*095*/ u_int16_t stream_min_req_size; +/*096*/ u_int16_t stream_transfer_time; +/*097*/ u_int16_t stream_access_latency; +/*098*/ u_int32_t stream_granularity; +/*100*/ u_int16_t lba_size48_1; + u_int16_t lba_size48_2; + u_int16_t lba_size48_3; + u_int16_t lba_size48_4; + u_int16_t reserved104; +/*105*/ u_int16_t max_dsm_blocks; +/*106*/ u_int16_t pss; +#define ATA_PSS_LSPPS 0x000F +#define ATA_PSS_LSSABOVE512 0x1000 +#define ATA_PSS_MULTLS 0x2000 +#define ATA_PSS_VALID_MASK 0xC000 +#define ATA_PSS_VALID_VALUE 0x4000 +/*107*/ u_int16_t isd; +/*108*/ u_int16_t wwn[4]; + u_int16_t reserved112[5]; +/*117*/ u_int16_t lss_1; +/*118*/ u_int16_t lss_2; +/*119*/ u_int16_t support2; +#define ATA_SUPPORT_WRITEREADVERIFY 0x0002 +#define ATA_SUPPORT_WRITEUNCORREXT 0x0004 +#define ATA_SUPPORT_RWLOGDMAEXT 0x0008 +#define ATA_SUPPORT_MICROCODE3 0x0010 +#define ATA_SUPPORT_FREEFALL 0x0020 +/*120*/ u_int16_t enabled2; + u_int16_t reserved121[6]; +/*127*/ u_int16_t removable_status; +/*128*/ u_int16_t security_status; +#define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ +#define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ +#define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ +#define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ +#define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ +#define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ +#define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ + + u_int16_t reserved129[31]; +/*160*/ u_int16_t cfa_powermode1; + u_int16_t reserved161; +/*162*/ u_int16_t cfa_kms_support; +/*163*/ u_int16_t cfa_trueide_modes; +/*164*/ u_int16_t cfa_memory_modes; + u_int16_t reserved165[4]; +/*169*/ u_int16_t support_dsm; +#define ATA_SUPPORT_DSM_TRIM 0x0001 + u_int16_t reserved170[6]; +/*176*/ u_int8_t media_serial[60]; +/*206*/ u_int16_t sct; + u_int16_t reserved206[2]; +/*209*/ u_int16_t lsalign; +/*210*/ u_int16_t wrv_sectors_m3_1; + u_int16_t wrv_sectors_m3_2; +/*212*/ u_int16_t wrv_sectors_m2_1; + u_int16_t wrv_sectors_m2_2; +/*214*/ u_int16_t nv_cache_caps; +/*215*/ u_int16_t nv_cache_size_1; + u_int16_t nv_cache_size_2; +/*217*/ u_int16_t media_rotation_rate; +#define ATA_RATE_NOT_REPORTED 0x0000 +#define ATA_RATE_NON_ROTATING 0x0001 + u_int16_t reserved218; +/*219*/ u_int16_t nv_cache_opt; +/*220*/ u_int16_t wrv_mode; + u_int16_t reserved221; +/*222*/ u_int16_t transport_major; +/*223*/ u_int16_t transport_minor; + u_int16_t reserved224[31]; +/*255*/ u_int16_t integrity; +} __packed; + +/* ATA Dataset Management */ +#define ATA_DSM_BLK_SIZE 512 +#define ATA_DSM_BLK_RANGES 64 +#define ATA_DSM_RANGE_SIZE 8 +#define ATA_DSM_RANGE_MAX 65535 + +/* + * ATA Device Register + * + * bit 7 Obsolete (was 1 in early ATA specs) + * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS + * bit 5 Obsolete (was 1 in early ATA specs) + * bit 4 1 = Slave Drive, 0 = Master Drive + * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number +*/ + +#define ATA_DEV_MASTER 0x00 +#define ATA_DEV_SLAVE 0x10 +#define ATA_DEV_LBA 0x40 + +/* ATA limits */ +#define ATA_MAX_28BIT_LBA 268435455UL + +/* ATA Status Register */ +#define ATA_STATUS_ERROR 0x01 +#define ATA_STATUS_DEVICE_FAULT 0x20 + +/* ATA Error Register */ +#define ATA_ERROR_ABORT 0x04 +#define ATA_ERROR_ID_NOT_FOUND 0x10 + +/* ATA HPA Features */ +#define ATA_HPA_FEAT_MAX_ADDR 0x00 +#define ATA_HPA_FEAT_SET_PWD 0x01 +#define ATA_HPA_FEAT_LOCK 0x02 +#define ATA_HPA_FEAT_UNLOCK 0x03 +#define ATA_HPA_FEAT_FREEZE 0x04 + +/* ATA transfer modes */ +#define ATA_MODE_MASK 0x0f +#define ATA_DMA_MASK 0xf0 +#define ATA_PIO 0x00 +#define ATA_PIO0 0x08 +#define ATA_PIO1 0x09 +#define ATA_PIO2 0x0a +#define ATA_PIO3 0x0b +#define ATA_PIO4 0x0c +#define ATA_PIO_MAX 0x0f +#define ATA_DMA 0x10 +#define ATA_WDMA0 0x20 +#define ATA_WDMA1 0x21 +#define ATA_WDMA2 0x22 +#define ATA_UDMA0 0x40 +#define ATA_UDMA1 0x41 +#define ATA_UDMA2 0x42 +#define ATA_UDMA3 0x43 +#define ATA_UDMA4 0x44 +#define ATA_UDMA5 0x45 +#define ATA_UDMA6 0x46 +#define ATA_SA150 0x47 +#define ATA_SA300 0x48 +#define ATA_DMA_MAX 0x4f + + +/* ATA commands */ +#define ATA_NOP 0x00 /* NOP */ +#define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */ +#define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */ +#define ATA_DATA_SET_MANAGEMENT 0x06 +#define ATA_DSM_TRIM 0x01 +#define ATA_DEVICE_RESET 0x08 /* reset device */ +#define ATA_READ 0x20 /* read */ +#define ATA_READ48 0x24 /* read 48bit LBA */ +#define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */ +#define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */ +#define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */ +#define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */ +#define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */ +#define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */ +#define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */ +#define ATA_WRITE 0x30 /* write */ +#define ATA_WRITE48 0x34 /* write 48bit LBA */ +#define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */ +#define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA*/ +#define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */ +#define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */ +#define ATA_WRITE_STREAM_DMA48 0x3a +#define ATA_WRITE_STREAM48 0x3b +#define ATA_WRITE_DMA_FUA48 0x3d +#define ATA_WRITE_DMA_QUEUED_FUA48 0x3e +#define ATA_WRITE_LOG_EXT 0x3f +#define ATA_READ_VERIFY 0x40 +#define ATA_READ_VERIFY48 0x42 +#define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ +#define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ +#define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ +#define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ +#define ATA_RECV_FPDMA_QUEUED 0x65 /* recieve DMA NCQ */ +#define ATA_SEP_ATTN 0x67 /* SEP request */ +#define ATA_SEEK 0x70 /* seek */ +#define ATA_PACKET_CMD 0xa0 /* packet command */ +#define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ +#define ATA_SERVICE 0xa2 /* service command */ +#define ATA_SMART_CMD 0xb0 /* SMART command */ +#define ATA_CFA_ERASE 0xc0 /* CFA erase */ +#define ATA_READ_MUL 0xc4 /* read multi */ +#define ATA_WRITE_MUL 0xc5 /* write multi */ +#define ATA_SET_MULTI 0xc6 /* set multi size */ +#define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ +#define ATA_READ_DMA 0xc8 /* read DMA */ +#define ATA_WRITE_DMA 0xca /* write DMA */ +#define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ +#define ATA_WRITE_MUL_FUA48 0xce +#define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ +#define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ +#define ATA_STANDBY_CMD 0xe2 /* standby */ +#define ATA_IDLE_CMD 0xe3 /* idle */ +#define ATA_READ_BUFFER 0xe4 /* read buffer */ +#define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_SLEEP 0xe6 /* sleep */ +#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ +#define ATA_WRITE_PM 0xe8 /* write portmultiplier */ +#define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ +#define ATA_ATA_IDENTIFY 0xec /* get ATA params */ +#define ATA_SETFEATURES 0xef /* features command */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ +#define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ +#define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ +#define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ +#define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ +#define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ +#define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ +#define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ +#define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ +#define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ +#define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ +#define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ +#define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ +#define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ +#define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ +#define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ +#define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ + + +/* ATAPI commands */ +#define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ +#define ATAPI_REZERO 0x01 /* rewind */ +#define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ +#define ATAPI_FORMAT 0x04 /* format unit */ +#define ATAPI_READ 0x08 /* read data */ +#define ATAPI_WRITE 0x0a /* write data */ +#define ATAPI_WEOF 0x10 /* write filemark */ +#define ATAPI_WF_WRITE 0x01 +#define ATAPI_SPACE 0x11 /* space command */ +#define ATAPI_SP_FM 0x01 +#define ATAPI_SP_EOD 0x03 +#define ATAPI_INQUIRY 0x12 /* get inquiry data */ +#define ATAPI_MODE_SELECT 0x15 /* mode select */ +#define ATAPI_ERASE 0x19 /* erase */ +#define ATAPI_MODE_SENSE 0x1a /* mode sense */ +#define ATAPI_START_STOP 0x1b /* start/stop unit */ +#define ATAPI_SS_LOAD 0x01 +#define ATAPI_SS_RETENSION 0x02 +#define ATAPI_SS_EJECT 0x04 +#define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ +#define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ +#define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ +#define ATAPI_READ_BIG 0x28 /* read data */ +#define ATAPI_WRITE_BIG 0x2a /* write data */ +#define ATAPI_LOCATE 0x2b /* locate to position */ +#define ATAPI_READ_POSITION 0x34 /* read position */ +#define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */ +#define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */ +#define ATAPI_READ_BUFFER 0x3c /* read device buffer */ +#define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */ +#define ATAPI_READ_TOC 0x43 /* get table of contents */ +#define ATAPI_PLAY_10 0x45 /* play by lba */ +#define ATAPI_PLAY_MSF 0x47 /* play by MSF address */ +#define ATAPI_PLAY_TRACK 0x48 /* play by track number */ +#define ATAPI_PAUSE 0x4b /* pause audio operation */ +#define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */ +#define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */ +#define ATAPI_RESERVE_TRACK 0x53 /* reserve track */ +#define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structurek */ +#define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */ +#define ATAPI_REPAIR_TRACK 0x58 /* repair track */ +#define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */ +#define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */ +#define ATAPI_CLOSE_TRACK 0x5b /* close track/session */ +#define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capicity */ +#define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */ +#define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */ +#define ATAPI_BLANK 0xa1 /* blank the media */ +#define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */ +#define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */ +#define ATAPI_PLAY_12 0xa5 /* play by lba */ +#define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */ +#define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */ +#define ATAPI_PLAY_CD 0xb4 /* universal play command */ +#define ATAPI_SET_SPEED 0xbb /* set drive speed */ +#define ATAPI_MECH_STATUS 0xbd /* get changer status */ +#define ATAPI_READ_CD 0xbe /* read data */ +#define ATAPI_POLL_DSC 0xff /* poll DSC status bit */ + + +struct ata_ioc_devices { + int channel; + char name[2][32]; + struct ata_params params[2]; +}; + +/* pr channel ATA ioctl calls */ +#define IOCATAGMAXCHANNEL _IOR('a', 1, int) +#define IOCATAREINIT _IOW('a', 2, int) +#define IOCATAATTACH _IOW('a', 3, int) +#define IOCATADETACH _IOW('a', 4, int) +#define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices) + +/* ATAPI request sense structure */ +struct atapi_sense { + u_int8_t error; /* current or deferred errors */ +#define ATA_SENSE_VALID 0x80 + + u_int8_t segment; /* segment number */ + u_int8_t key; /* sense key */ +#define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */ +#define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */ +#define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */ +#define ATA_SENSE_NOT_READY 0x02 /* no access to drive */ +#define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */ +#define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */ +#define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */ +#define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */ +#define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */ +#define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */ +#define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */ +#define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */ +#define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */ +#define ATA_SENSE_EQUAL 0x0c /* equal */ +#define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */ +#define ATA_SENSE_MISCOMPARE 0x0e /* data dont match the medium */ +#define ATA_SENSE_RESERVED 0x0f +#define ATA_SENSE_ILI 0x20; +#define ATA_SENSE_EOM 0x40; +#define ATA_SENSE_FILEMARK 0x80; + + u_int32_t cmd_info; /* cmd information */ + u_int8_t sense_length; /* additional sense len (n-7) */ + u_int32_t cmd_specific_info; /* additional cmd spec info */ + u_int8_t asc; /* additional sense code */ + u_int8_t ascq; /* additional sense code qual */ + u_int8_t replaceable_unit_code; /* replaceable unit code */ + u_int8_t specific; /* sense key specific */ +#define ATA_SENSE_SPEC_VALID 0x80 +#define ATA_SENSE_SPEC_MASK 0x7f + + u_int8_t specific1; /* sense key specific */ + u_int8_t specific2; /* sense key specific */ +} __packed; + +struct ata_ioc_request { + union { + struct { + u_int8_t command; + u_int8_t feature; + u_int64_t lba; + u_int16_t count; + } ata; + struct { + char ccb[16]; + struct atapi_sense sense; + } atapi; + } u; + caddr_t data; + int count; + int flags; +#define ATA_CMD_CONTROL 0x01 +#define ATA_CMD_READ 0x02 +#define ATA_CMD_WRITE 0x04 +#define ATA_CMD_ATAPI 0x08 + + int timeout; + int error; +}; + +struct ata_security_password { + u_int16_t ctrl; +#define ATA_SECURITY_PASSWORD_USER 0x0000 +#define ATA_SECURITY_PASSWORD_MASTER 0x0001 +#define ATA_SECURITY_ERASE_NORMAL 0x0000 +#define ATA_SECURITY_ERASE_ENHANCED 0x0002 +#define ATA_SECURITY_LEVEL_HIGH 0x0000 +#define ATA_SECURITY_LEVEL_MAXIMUM 0x0100 + + u_int8_t password[32]; + u_int16_t revision; + u_int16_t reserved[238]; +}; + +/* pr device ATA ioctl calls */ +#define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request) +#define IOCATAGPARM _IOR('a', 101, struct ata_params) +#define IOCATAGMODE _IOR('a', 102, int) +#define IOCATASMODE _IOW('a', 103, int) + +#define IOCATAGSPINDOWN _IOR('a', 104, int) +#define IOCATASSPINDOWN _IOW('a', 105, int) + + +struct ata_ioc_raid_config { + int lun; + int type; +#define AR_JBOD 0x0001 +#define AR_SPAN 0x0002 +#define AR_RAID0 0x0004 +#define AR_RAID1 0x0008 +#define AR_RAID01 0x0010 +#define AR_RAID3 0x0020 +#define AR_RAID4 0x0040 +#define AR_RAID5 0x0080 + + int interleave; + int status; +#define AR_READY 1 +#define AR_DEGRADED 2 +#define AR_REBUILDING 4 + + int progress; + int total_disks; + int disks[16]; +}; + +struct ata_ioc_raid_status { + int lun; + int type; + int interleave; + int status; + int progress; + int total_disks; + struct { + int state; +#define AR_DISK_ONLINE 0x01 +#define AR_DISK_PRESENT 0x02 +#define AR_DISK_SPARE 0x04 + int lun; + } disks[16]; +}; + +/* ATA RAID ioctl calls */ +#define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config) +#define IOCATARAIDDELETE _IOW('a', 201, int) +#define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status) +#define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config) +#define IOCATARAIDREBUILD _IOW('a', 204, int) + +#endif /* _SYS_ATA_H_ */ diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h new file mode 100644 index 0000000000..393dfbc131 --- /dev/null +++ b/usr/contrib/freebsd/sys/linker_set.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 1999 John D. Polstra + * Copyright (c) 1999,2001 Peter Wemm <peter@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $ + */ + +#ifndef _SYS_LINKER_SET_H_ +#define _SYS_LINKER_SET_H_ + +#ifdef __FreeBSD__ +#ifndef _SYS_CDEFS_H_ +#error this file needs sys/cdefs.h as a prerequisite +#endif +#else +#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ +#error this file needs sys/cdefs.h as a prerequisite +#endif +#endif + +/* + * The following macros are used to declare global sets of objects, which + * are collected by the linker into a `linker_set' as defined below. + * For ELF, this is done by constructing a separate segment for each set. + */ + +/* + * Private macros, not to be used outside this header file. + */ +#ifdef __GNUCLIKE___SECTION +#ifdef __FreeBSD__ +#define __MAKE_SET(set, sym) \ + __GLOBL(__CONCAT(__start_set_,set)); \ + __GLOBL(__CONCAT(__stop_set_,set)); \ + static void const * const __set_##set##_sym_##sym \ + __section("set_" #set) __used = &sym +#else +#define __MAKE_SET(set, sym) \ + static void const * const __set_##set##_sym_##sym \ + __section("set_" #set) __used = &sym +#endif +#else /* !__GNUCLIKE___SECTION */ +#ifndef lint +#error this file needs to be ported to your compiler +#endif /* lint */ +#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym) +#endif /* __GNUCLIKE___SECTION */ + +/* + * Public macros. + */ +#define TEXT_SET(set, sym) __MAKE_SET(set, sym) +#define DATA_SET(set, sym) __MAKE_SET(set, sym) +#define BSS_SET(set, sym) __MAKE_SET(set, sym) +#define ABS_SET(set, sym) __MAKE_SET(set, sym) +#define SET_ENTRY(set, sym) __MAKE_SET(set, sym) + +/* + * Initialize before referring to a given linker set. + */ +#ifdef __FreeBSD__ +#define SET_DECLARE(set, ptype) \ + extern ptype *__CONCAT(__start_set_,set); \ + extern ptype *__CONCAT(__stop_set_,set) +#else +#define SET_DECLARE(set, ptype) \ + _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \ + _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \ + extern ptype *__CONCAT(__start_set_,set); \ + extern ptype *__CONCAT(__stop_set_,set) +#endif + +#define SET_BEGIN(set) \ + (&__CONCAT(__start_set_,set)) +#define SET_LIMIT(set) \ + (&__CONCAT(__stop_set_,set)) + +/* + * Iterate over all the elements of a set. + * + * Sets always contain addresses of things, and "pvar" points to words + * containing those addresses. Thus is must be declared as "type **pvar", + * and the address of each set item is obtained inside the loop by "*pvar". + */ +#define SET_FOREACH(pvar, set) \ + for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++) + +#define SET_ITEM(set, i) \ + ((SET_BEGIN(set))[i]) + +/* + * Provide a count of the items in a set. + */ +#define SET_COUNT(set) \ + (SET_LIMIT(set) - SET_BEGIN(set)) + +#endif /* _SYS_LINKER_SET_H_ */ diff --git a/usr/contrib/freebsd/sys/tree.h b/usr/contrib/freebsd/sys/tree.h new file mode 100644 index 0000000000..6b47e247bb --- /dev/null +++ b/usr/contrib/freebsd/sys/tree.h @@ -0,0 +1,765 @@ +/* $NetBSD: tree.h,v 1.8 2004/03/28 19:38:30 provos Exp $ */ +/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */ +/* $FreeBSD: head/sys/sys/tree.h 189204 2009-03-01 04:57:23Z bms $ */ + +/*- + * Copyright 2002 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_TREE_H_ +#define _SYS_TREE_H_ + +#include <sys/cdefs.h> + +/* + * This file defines data structures for different types of trees: + * splay trees and red-black trees. + * + * A splay tree is a self-organizing data structure. Every operation + * on the tree causes a splay to happen. The splay moves the requested + * node to the root of the tree and partly rebalances it. + * + * This has the benefit that request locality causes faster lookups as + * the requested nodes move to the top of the tree. On the other hand, + * every lookup causes memory writes. + * + * The Balance Theorem bounds the total access time for m operations + * and n inserts on an initially empty tree as O((m + n)lg n). The + * amortized cost for a sequence of m accesses to a splay tree is O(lg n); + * + * A red-black tree is a binary search tree with the node color as an + * extra attribute. It fulfills a set of conditions: + * - every search path from the root to a leaf consists of the + * same number of black nodes, + * - each red node (except for the root) has a black parent, + * - each leaf node is black. + * + * Every operation on a red-black tree is bounded as O(lg n). + * The maximum height of a red-black tree is 2lg (n+1). + */ + +#define SPLAY_HEAD(name, type) \ +struct name { \ + struct type *sph_root; /* root of the tree */ \ +} + +#define SPLAY_INITIALIZER(root) \ + { NULL } + +#define SPLAY_INIT(root) do { \ + (root)->sph_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ENTRY(type) \ +struct { \ + struct type *spe_left; /* left element */ \ + struct type *spe_right; /* right element */ \ +} + +#define SPLAY_LEFT(elm, field) (elm)->field.spe_left +#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right +#define SPLAY_ROOT(head) (head)->sph_root +#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) + +/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ +#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKLEFT(head, tmp, field) do { \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKRIGHT(head, tmp, field) do { \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ + SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ + SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ + +#define SPLAY_PROTOTYPE(name, type, field, cmp) \ +void name##_SPLAY(struct name *, struct type *); \ +void name##_SPLAY_MINMAX(struct name *, int); \ +struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ +struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ + \ +/* Finds the node with the same key as elm */ \ +static __inline struct type * \ +name##_SPLAY_FIND(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) \ + return(NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) \ + return (head->sph_root); \ + return (NULL); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_NEXT(struct name *head, struct type *elm) \ +{ \ + name##_SPLAY(head, elm); \ + if (SPLAY_RIGHT(elm, field) != NULL) { \ + elm = SPLAY_RIGHT(elm, field); \ + while (SPLAY_LEFT(elm, field) != NULL) { \ + elm = SPLAY_LEFT(elm, field); \ + } \ + } else \ + elm = NULL; \ + return (elm); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_MIN_MAX(struct name *head, int val) \ +{ \ + name##_SPLAY_MINMAX(head, val); \ + return (SPLAY_ROOT(head)); \ +} + +/* Main splay operation. + * Moves node close to the key of elm to top + */ +#define SPLAY_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_SPLAY_INSERT(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) { \ + SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ + } else { \ + int __comp; \ + name##_SPLAY(head, elm); \ + __comp = (cmp)(elm, (head)->sph_root); \ + if(__comp < 0) { \ + SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ + SPLAY_RIGHT(elm, field) = (head)->sph_root; \ + SPLAY_LEFT((head)->sph_root, field) = NULL; \ + } else if (__comp > 0) { \ + SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT(elm, field) = (head)->sph_root; \ + SPLAY_RIGHT((head)->sph_root, field) = NULL; \ + } else \ + return ((head)->sph_root); \ + } \ + (head)->sph_root = (elm); \ + return (NULL); \ +} \ + \ +struct type * \ +name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *__tmp; \ + if (SPLAY_EMPTY(head)) \ + return (NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) { \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ + } else { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ + name##_SPLAY(head, elm); \ + SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ + } \ + return (elm); \ + } \ + return (NULL); \ +} \ + \ +void \ +name##_SPLAY(struct name *head, struct type *elm) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ + int __comp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) > 0){ \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} \ + \ +/* Splay with either the minimum or the maximum element \ + * Used to find minimum or maximum element in tree. \ + */ \ +void name##_SPLAY_MINMAX(struct name *head, int __comp) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while (1) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp > 0) { \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} + +#define SPLAY_NEGINF -1 +#define SPLAY_INF 1 + +#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) +#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) +#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) +#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) +#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) +#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) + +#define SPLAY_FOREACH(x, name, head) \ + for ((x) = SPLAY_MIN(name, head); \ + (x) != NULL; \ + (x) = SPLAY_NEXT(name, head, x)) + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ + int rbe_color; /* node color */ \ +} + +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_COLOR(elm, field) (elm)->field.rbe_color +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(elm, parent, field) do { \ + RB_PARENT(elm, field) = parent; \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + RB_COLOR(elm, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#define RB_SET_BLACKRED(black, red, field) do { \ + RB_COLOR(black, field) = RB_BLACK; \ + RB_COLOR(red, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#ifndef RB_AUGMENT +#define RB_AUGMENT(x) do {} while (0) +#endif + +#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ + RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ + RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) +#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ +attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ +attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ +attr struct type *name##_RB_INSERT(struct name *, struct type *); \ +attr struct type *name##_RB_FIND(struct name *, struct type *); \ +attr struct type *name##_RB_NFIND(struct name *, struct type *); \ +attr struct type *name##_RB_NEXT(struct type *); \ +attr struct type *name##_RB_PREV(struct type *); \ +attr struct type *name##_RB_MINMAX(struct name *, int); \ + \ + +/* Main rb operation. + * Moves node close to the key of elm to top + */ +#define RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp,) +#define RB_GENERATE_STATIC(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ +attr void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = RB_PARENT(elm, field)) != NULL && \ + RB_COLOR(parent, field) == RB_RED) { \ + gparent = RB_PARENT(parent, field); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_RIGHT(head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_LEFT(head, gparent, tmp, field); \ + } \ + } \ + RB_COLOR(head->rbh_root, field) = RB_BLACK; \ +} \ + \ +attr void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)) \ + != NULL) \ + RB_COLOR(oleft, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_RIGHT(head, tmp, oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_RIGHT(tmp, field)) \ + RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)) \ + != NULL) \ + RB_COLOR(oright, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_LEFT(head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_LEFT(tmp, field)) \ + RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + RB_COLOR(elm, field) = RB_BLACK; \ +} \ + \ +attr struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field)) != NULL) \ + elm = left; \ + child = RB_RIGHT(elm, field); \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ + if (RB_PARENT(elm, field) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (RB_PARENT(old, field)) { \ + if (RB_LEFT(RB_PARENT(old, field), field) == old)\ + RB_LEFT(RB_PARENT(old, field), field) = elm;\ + else \ + RB_RIGHT(RB_PARENT(old, field), field) = elm;\ + RB_AUGMENT(RB_PARENT(old, field)); \ + } else \ + RB_ROOT(head) = elm; \ + RB_PARENT(RB_LEFT(old, field), field) = elm; \ + if (RB_RIGHT(old, field)) \ + RB_PARENT(RB_RIGHT(old, field), field) = elm; \ + if (parent) { \ + left = parent; \ + do { \ + RB_AUGMENT(left); \ + } while ((left = RB_PARENT(left, field)) != NULL); \ + } \ + goto color; \ + } \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +attr struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type *parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ + else \ + RB_RIGHT(parent, field) = elm; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +attr struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* Finds the first node greater than or equal to the search key */ \ +attr struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_SAFE(x, name, head, y) \ + for ((x) = RB_MIN(name, head); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE(x, name, head) \ + for ((x) = RB_MAX(name, head); \ + (x) != NULL; \ + (x) = name##_RB_PREV(x)) + +#define RB_FOREACH_REVERSE_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ + for ((x) = RB_MAX(name, head); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ + (x) = (y)) + +#endif /* _SYS_TREE_H_ */ diff --git a/usr/contrib/freebsd/x86/apicreg.h b/usr/contrib/freebsd/x86/apicreg.h new file mode 100644 index 0000000000..24006e2733 --- /dev/null +++ b/usr/contrib/freebsd/x86/apicreg.h @@ -0,0 +1,455 @@ +/*- + * Copyright (c) 1996, by Peter Wemm and Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/x86/include/apicreg.h 259140 2013-12-09 21:08:52Z jhb $ + */ + +#ifndef _X86_APICREG_H_ +#define _X86_APICREG_H_ + +/* + * Local && I/O APIC definitions. + */ + +/* + * Pentium P54C+ Built-in APIC + * (Advanced programmable Interrupt Controller) + * + * Base Address of Built-in APIC in memory location + * is 0xfee00000. + * + * Map of APIC Registers: + * + * Offset (hex) Description Read/Write state + * 000 Reserved + * 010 Reserved + * 020 ID Local APIC ID R/W + * 030 VER Local APIC Version R + * 040 Reserved + * 050 Reserved + * 060 Reserved + * 070 Reserved + * 080 Task Priority Register R/W + * 090 Arbitration Priority Register R + * 0A0 Processor Priority Register R + * 0B0 EOI Register W + * 0C0 RRR Remote read R + * 0D0 Logical Destination R/W + * 0E0 Destination Format Register 0..27 R; 28..31 R/W + * 0F0 SVR Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W + * 100 ISR 000-031 R + * 110 ISR 032-063 R + * 120 ISR 064-095 R + * 130 ISR 095-128 R + * 140 ISR 128-159 R + * 150 ISR 160-191 R + * 160 ISR 192-223 R + * 170 ISR 224-255 R + * 180 TMR 000-031 R + * 190 TMR 032-063 R + * 1A0 TMR 064-095 R + * 1B0 TMR 095-128 R + * 1C0 TMR 128-159 R + * 1D0 TMR 160-191 R + * 1E0 TMR 192-223 R + * 1F0 TMR 224-255 R + * 200 IRR 000-031 R + * 210 IRR 032-063 R + * 220 IRR 064-095 R + * 230 IRR 095-128 R + * 240 IRR 128-159 R + * 250 IRR 160-191 R + * 260 IRR 192-223 R + * 270 IRR 224-255 R + * 280 Error Status Register R + * 290 Reserved + * 2A0 Reserved + * 2B0 Reserved + * 2C0 Reserved + * 2D0 Reserved + * 2E0 Reserved + * 2F0 Local Vector Table (CMCI) R/W + * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W + * 310 ICR_HI Interrupt Command Reg. (32-63) R/W + * 320 Local Vector Table (Timer) R/W + * 330 Local Vector Table (Thermal) R/W (PIV+) + * 340 Local Vector Table (Performance) R/W (P6+) + * 350 LVT1 Local Vector Table (LINT0) R/W + * 360 LVT2 Local Vector Table (LINT1) R/W + * 370 LVT3 Local Vector Table (ERROR) R/W + * 380 Initial Count Reg. for Timer R/W + * 390 Current Count of Timer R + * 3A0 Reserved + * 3B0 Reserved + * 3C0 Reserved + * 3D0 Reserved + * 3E0 Timer Divide Configuration Reg. R/W + * 3F0 Reserved + */ + + +/****************************************************************************** + * global defines, etc. + */ + + +/****************************************************************************** + * LOCAL APIC structure + */ + +#ifndef LOCORE +#include <sys/types.h> + +#define PAD3 int : 32; int : 32; int : 32 +#define PAD4 int : 32; int : 32; int : 32; int : 32 + +struct LAPIC { + /* reserved */ PAD4; + /* reserved */ PAD4; + u_int32_t id; PAD3; + u_int32_t version; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + u_int32_t tpr; PAD3; + u_int32_t apr; PAD3; + u_int32_t ppr; PAD3; + u_int32_t eoi; PAD3; + /* reserved */ PAD4; + u_int32_t ldr; PAD3; + u_int32_t dfr; PAD3; + u_int32_t svr; PAD3; + u_int32_t isr0; PAD3; + u_int32_t isr1; PAD3; + u_int32_t isr2; PAD3; + u_int32_t isr3; PAD3; + u_int32_t isr4; PAD3; + u_int32_t isr5; PAD3; + u_int32_t isr6; PAD3; + u_int32_t isr7; PAD3; + u_int32_t tmr0; PAD3; + u_int32_t tmr1; PAD3; + u_int32_t tmr2; PAD3; + u_int32_t tmr3; PAD3; + u_int32_t tmr4; PAD3; + u_int32_t tmr5; PAD3; + u_int32_t tmr6; PAD3; + u_int32_t tmr7; PAD3; + u_int32_t irr0; PAD3; + u_int32_t irr1; PAD3; + u_int32_t irr2; PAD3; + u_int32_t irr3; PAD3; + u_int32_t irr4; PAD3; + u_int32_t irr5; PAD3; + u_int32_t irr6; PAD3; + u_int32_t irr7; PAD3; + u_int32_t esr; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + u_int32_t lvt_cmci; PAD3; + u_int32_t icr_lo; PAD3; + u_int32_t icr_hi; PAD3; + u_int32_t lvt_timer; PAD3; + u_int32_t lvt_thermal; PAD3; + u_int32_t lvt_pcint; PAD3; + u_int32_t lvt_lint0; PAD3; + u_int32_t lvt_lint1; PAD3; + u_int32_t lvt_error; PAD3; + u_int32_t icr_timer; PAD3; + u_int32_t ccr_timer; PAD3; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + /* reserved */ PAD4; + u_int32_t dcr_timer; PAD3; + /* reserved */ PAD4; +}; + +typedef struct LAPIC lapic_t; + +/****************************************************************************** + * I/O APIC structure + */ + +struct IOAPIC { + u_int32_t ioregsel; PAD3; + u_int32_t iowin; PAD3; +}; + +typedef struct IOAPIC ioapic_t; + +#undef PAD4 +#undef PAD3 + +#endif /* !LOCORE */ + + +/****************************************************************************** + * various code 'logical' values + */ + +/****************************************************************************** + * LOCAL APIC defines + */ + +/* default physical locations of LOCAL (CPU) APICs */ +#define DEFAULT_APIC_BASE 0xfee00000 + +/* constants relating to APIC ID registers */ +#define APIC_ID_MASK 0xff000000 +#define APIC_ID_SHIFT 24 +#define APIC_ID_CLUSTER 0xf0 +#define APIC_ID_CLUSTER_ID 0x0f +#define APIC_MAX_CLUSTER 0xe +#define APIC_MAX_INTRACLUSTER_ID 3 +#define APIC_ID_CLUSTER_SHIFT 4 + +/* fields in VER */ +#define APIC_VER_VERSION 0x000000ff +#define APIC_VER_MAXLVT 0x00ff0000 +#define MAXLVTSHIFT 16 +#define APIC_VER_EOI_SUPPRESSION 0x01000000 + +/* fields in LDR */ +#define APIC_LDR_RESERVED 0x00ffffff + +/* fields in DFR */ +#define APIC_DFR_RESERVED 0x0fffffff +#define APIC_DFR_MODEL_MASK 0xf0000000 +#define APIC_DFR_MODEL_FLAT 0xf0000000 +#define APIC_DFR_MODEL_CLUSTER 0x00000000 + +/* fields in SVR */ +#define APIC_SVR_VECTOR 0x000000ff +#define APIC_SVR_VEC_PROG 0x000000f0 +#define APIC_SVR_VEC_FIX 0x0000000f +#define APIC_SVR_ENABLE 0x00000100 +# define APIC_SVR_SWDIS 0x00000000 +# define APIC_SVR_SWEN 0x00000100 +#define APIC_SVR_FOCUS 0x00000200 +# define APIC_SVR_FEN 0x00000000 +# define APIC_SVR_FDIS 0x00000200 +#define APIC_SVR_EOI_SUPPRESSION 0x00001000 + +/* fields in TPR */ +#define APIC_TPR_PRIO 0x000000ff +# define APIC_TPR_INT 0x000000f0 +# define APIC_TPR_SUB 0x0000000f + +/* fields in ESR */ +#define APIC_ESR_SEND_CS_ERROR 0x00000001 +#define APIC_ESR_RECEIVE_CS_ERROR 0x00000002 +#define APIC_ESR_SEND_ACCEPT 0x00000004 +#define APIC_ESR_RECEIVE_ACCEPT 0x00000008 +#define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020 +#define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040 +#define APIC_ESR_ILLEGAL_REGISTER 0x00000080 + +/* fields in ICR_LOW */ +#define APIC_VECTOR_MASK 0x000000ff + +#define APIC_DELMODE_MASK 0x00000700 +# define APIC_DELMODE_FIXED 0x00000000 +# define APIC_DELMODE_LOWPRIO 0x00000100 +# define APIC_DELMODE_SMI 0x00000200 +# define APIC_DELMODE_RR 0x00000300 +# define APIC_DELMODE_NMI 0x00000400 +# define APIC_DELMODE_INIT 0x00000500 +# define APIC_DELMODE_STARTUP 0x00000600 +# define APIC_DELMODE_RESV 0x00000700 + +#define APIC_DESTMODE_MASK 0x00000800 +# define APIC_DESTMODE_PHY 0x00000000 +# define APIC_DESTMODE_LOG 0x00000800 + +#define APIC_DELSTAT_MASK 0x00001000 +# define APIC_DELSTAT_IDLE 0x00000000 +# define APIC_DELSTAT_PEND 0x00001000 + +#define APIC_RESV1_MASK 0x00002000 + +#define APIC_LEVEL_MASK 0x00004000 +# define APIC_LEVEL_DEASSERT 0x00000000 +# define APIC_LEVEL_ASSERT 0x00004000 + +#define APIC_TRIGMOD_MASK 0x00008000 +# define APIC_TRIGMOD_EDGE 0x00000000 +# define APIC_TRIGMOD_LEVEL 0x00008000 + +#define APIC_RRSTAT_MASK 0x00030000 +# define APIC_RRSTAT_INVALID 0x00000000 +# define APIC_RRSTAT_INPROG 0x00010000 +# define APIC_RRSTAT_VALID 0x00020000 +# define APIC_RRSTAT_RESV 0x00030000 + +#define APIC_DEST_MASK 0x000c0000 +# define APIC_DEST_DESTFLD 0x00000000 +# define APIC_DEST_SELF 0x00040000 +# define APIC_DEST_ALLISELF 0x00080000 +# define APIC_DEST_ALLESELF 0x000c0000 + +#define APIC_RESV2_MASK 0xfff00000 + +#define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK) + +/* fields in LVT1/2 */ +#define APIC_LVT_VECTOR 0x000000ff +#define APIC_LVT_DM 0x00000700 +# define APIC_LVT_DM_FIXED 0x00000000 +# define APIC_LVT_DM_SMI 0x00000200 +# define APIC_LVT_DM_NMI 0x00000400 +# define APIC_LVT_DM_INIT 0x00000500 +# define APIC_LVT_DM_EXTINT 0x00000700 +#define APIC_LVT_DS 0x00001000 +#define APIC_LVT_IIPP 0x00002000 +#define APIC_LVT_IIPP_INTALO 0x00002000 +#define APIC_LVT_IIPP_INTAHI 0x00000000 +#define APIC_LVT_RIRR 0x00004000 +#define APIC_LVT_TM 0x00008000 +#define APIC_LVT_M 0x00010000 + + +/* fields in LVT Timer */ +#define APIC_LVTT_VECTOR 0x000000ff +#define APIC_LVTT_DS 0x00001000 +#define APIC_LVTT_M 0x00010000 +#define APIC_LVTT_TM 0x00020000 +# define APIC_LVTT_TM_ONE_SHOT 0x00000000 +# define APIC_LVTT_TM_PERIODIC 0x00020000 + + +/* APIC timer current count */ +#define APIC_TIMER_MAX_COUNT 0xffffffff + +/* fields in TDCR */ +#define APIC_TDCR_2 0x00 +#define APIC_TDCR_4 0x01 +#define APIC_TDCR_8 0x02 +#define APIC_TDCR_16 0x03 +#define APIC_TDCR_32 0x08 +#define APIC_TDCR_64 0x09 +#define APIC_TDCR_128 0x0a +#define APIC_TDCR_1 0x0b + +/* LVT table indices */ +#define APIC_LVT_LINT0 0 +#define APIC_LVT_LINT1 1 +#define APIC_LVT_TIMER 2 +#define APIC_LVT_ERROR 3 +#define APIC_LVT_PMC 4 +#define APIC_LVT_THERMAL 5 +#define APIC_LVT_CMCI 6 +#define APIC_LVT_MAX APIC_LVT_CMCI + +/****************************************************************************** + * I/O APIC defines + */ + +/* default physical locations of an IO APIC */ +#define DEFAULT_IO_APIC_BASE 0xfec00000 + +/* window register offset */ +#define IOAPIC_WINDOW 0x10 +#define IOAPIC_EOIR 0x40 + +/* indexes into IO APIC */ +#define IOAPIC_ID 0x00 +#define IOAPIC_VER 0x01 +#define IOAPIC_ARB 0x02 +#define IOAPIC_REDTBL 0x10 +#define IOAPIC_REDTBL0 IOAPIC_REDTBL +#define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02) +#define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04) +#define IOAPIC_REDTBL3 (IOAPIC_REDTBL+0x06) +#define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08) +#define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a) +#define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c) +#define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e) +#define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10) +#define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12) +#define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14) +#define IOAPIC_REDTBL11 (IOAPIC_REDTBL+0x16) +#define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18) +#define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a) +#define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c) +#define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e) +#define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20) +#define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22) +#define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24) +#define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26) +#define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28) +#define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a) +#define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c) +#define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e) + +/* fields in VER */ +#define IOART_VER_VERSION 0x000000ff +#define IOART_VER_MAXREDIR 0x00ff0000 +#define MAXREDIRSHIFT 16 + +/* + * fields in the IO APIC's redirection table entries + */ +#define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */ + +#define IOART_RESV 0x00fe0000 /* reserved */ + +#define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */ +# define IOART_INTMCLR 0x00000000 /* clear, allow INTs */ +# define IOART_INTMSET 0x00010000 /* set, inhibit INTs */ + +#define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */ +# define IOART_TRGREDG 0x00000000 /* edge */ +# define IOART_TRGRLVL 0x00008000 /* level */ + +#define IOART_REM_IRR 0x00004000 /* RO: remote IRR */ + +#define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */ +# define IOART_INTAHI 0x00000000 /* active high */ +# define IOART_INTALO 0x00002000 /* active low */ + +#define IOART_DELIVS 0x00001000 /* RO: delivery status */ + +#define IOART_DESTMOD 0x00000800 /* R/W: destination mode */ +# define IOART_DESTPHY 0x00000000 /* physical */ +# define IOART_DESTLOG 0x00000800 /* logical */ + +#define IOART_DELMOD 0x00000700 /* R/W: delivery mode */ +# define IOART_DELFIXED 0x00000000 /* fixed */ +# define IOART_DELLOPRI 0x00000100 /* lowest priority */ +# define IOART_DELSMI 0x00000200 /* System Management INT */ +# define IOART_DELRSV1 0x00000300 /* reserved */ +# define IOART_DELNMI 0x00000400 /* NMI signal */ +# define IOART_DELINIT 0x00000500 /* INIT signal */ +# define IOART_DELRSV2 0x00000600 /* reserved */ +# define IOART_DELEXINT 0x00000700 /* External INTerrupt */ + +#define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */ + +#endif /* _X86_APICREG_H_ */ diff --git a/usr/contrib/freebsd/x86/mptable.h b/usr/contrib/freebsd/x86/mptable.h new file mode 100644 index 0000000000..8f3c62a295 --- /dev/null +++ b/usr/contrib/freebsd/x86/mptable.h @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/x86/include/mptable.h 259228 2013-12-11 21:19:04Z jhb $ + */ + +#ifndef __MACHINE_MPTABLE_H__ +#define __MACHINE_MPTABLE_H__ + +enum busTypes { + NOBUS = 0, + CBUS = 1, + CBUSII = 2, + EISA = 3, + ISA = 6, + MCA = 9, + PCI = 13, + XPRESS = 18, + MAX_BUSTYPE = 18, + UNKNOWN_BUSTYPE = 0xff +}; + +/* MP Floating Pointer Structure */ +typedef struct MPFPS { + uint8_t signature[4]; + uint32_t pap; + uint8_t length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t config_type; + uint8_t mpfb2; + uint8_t mpfb3; + uint8_t mpfb4; + uint8_t mpfb5; +} __packed *mpfps_t; + +#define MPFB2_IMCR_PRESENT 0x80 +#define MPFB2_MUL_CLK_SRCS 0x40 + +/* MP Configuration Table Header */ +typedef struct MPCTH { + uint8_t signature[4]; + uint16_t base_table_length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t oem_id[8]; + uint8_t product_id[12]; + uint32_t oem_table_pointer; + uint16_t oem_table_size; + uint16_t entry_count; + uint32_t apic_address; + uint16_t extended_table_length; + uint8_t extended_table_checksum; + uint8_t reserved; +} __packed *mpcth_t; + +/* Base table entries */ + +#define MPCT_ENTRY_PROCESSOR 0 +#define MPCT_ENTRY_BUS 1 +#define MPCT_ENTRY_IOAPIC 2 +#define MPCT_ENTRY_INT 3 +#define MPCT_ENTRY_LOCAL_INT 4 + +typedef struct PROCENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t cpu_flags; + uint32_t cpu_signature; + uint32_t feature_flags; + uint32_t reserved1; + uint32_t reserved2; +} __packed *proc_entry_ptr; + +#define PROCENTRY_FLAG_EN 0x01 +#define PROCENTRY_FLAG_BP 0x02 + +typedef struct BUSENTRY { + uint8_t type; + uint8_t bus_id; + uint8_t bus_type[6]; +} __packed *bus_entry_ptr; + +typedef struct IOAPICENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t apic_flags; + uint32_t apic_address; +} __packed *io_apic_entry_ptr; + +#define IOAPICENTRY_FLAG_EN 0x01 + +typedef struct INTENTRY { + uint8_t type; + uint8_t int_type; + uint16_t int_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_int; +} __packed *int_entry_ptr; + +#define INTENTRY_TYPE_INT 0 +#define INTENTRY_TYPE_NMI 1 +#define INTENTRY_TYPE_SMI 2 +#define INTENTRY_TYPE_EXTINT 3 + +#define INTENTRY_FLAGS_POLARITY 0x3 +#define INTENTRY_FLAGS_POLARITY_CONFORM 0x0 +#define INTENTRY_FLAGS_POLARITY_ACTIVEHI 0x1 +#define INTENTRY_FLAGS_POLARITY_ACTIVELO 0x3 +#define INTENTRY_FLAGS_TRIGGER 0xc +#define INTENTRY_FLAGS_TRIGGER_CONFORM 0x0 +#define INTENTRY_FLAGS_TRIGGER_EDGE 0x4 +#define INTENTRY_FLAGS_TRIGGER_LEVEL 0xc + +/* Extended table entries */ + +typedef struct EXTENTRY { + uint8_t type; + uint8_t length; +} __packed *ext_entry_ptr; + +#define MPCT_EXTENTRY_SAS 0x80 +#define MPCT_EXTENTRY_BHD 0x81 +#define MPCT_EXTENTRY_CBASM 0x82 + +typedef struct SASENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_type; + uint64_t address_base; + uint64_t address_length; +} __packed *sas_entry_ptr; + +#define SASENTRY_TYPE_IO 0 +#define SASENTRY_TYPE_MEMORY 1 +#define SASENTRY_TYPE_PREFETCH 2 + +typedef struct BHDENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t bus_info; + uint8_t parent_bus; + uint8_t reserved[3]; +} __packed *bhd_entry_ptr; + +#define BHDENTRY_INFO_SUBTRACTIVE_DECODE 0x1 + +typedef struct CBASMENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_mod; + uint32_t predefined_range; +} __packed *cbasm_entry_ptr; + +#define CBASMENTRY_ADDRESS_MOD_ADD 0x0 +#define CBASMENTRY_ADDRESS_MOD_SUBTRACT 0x1 + +#define CBASMENTRY_RANGE_ISA_IO 0 +#define CBASMENTRY_RANGE_VGA_IO 1 + +#ifdef _KERNEL +struct mptable_hostb_softc { +#ifdef NEW_PCIB + struct pcib_host_resources sc_host_res; + int sc_decodes_vga_io; + int sc_decodes_isa_io; +#endif +}; + +#ifdef NEW_PCIB +void mptable_pci_host_res_init(device_t pcib); +#endif +int mptable_pci_probe_table(int bus); +int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin); +#endif +#endif /* !__MACHINE_MPTABLE_H__ */ diff --git a/usr/contrib/freebsd/x86/psl.h b/usr/contrib/freebsd/x86/psl.h new file mode 100644 index 0000000000..6934b4feb7 --- /dev/null +++ b/usr/contrib/freebsd/x86/psl.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)psl.h 5.2 (Berkeley) 1/18/91 + * $FreeBSD: head/sys/x86/include/psl.h 258135 2013-11-14 15:37:20Z emaste $ + */ + +#ifndef _MACHINE_PSL_H_ +#define _MACHINE_PSL_H_ + +/* + * 386 processor status longword. + */ +#define PSL_C 0x00000001 /* carry bit */ +#define PSL_PF 0x00000004 /* parity bit */ +#define PSL_AF 0x00000010 /* bcd carry bit */ +#define PSL_Z 0x00000040 /* zero bit */ +#define PSL_N 0x00000080 /* negative bit */ +#define PSL_T 0x00000100 /* trace enable bit */ +#define PSL_I 0x00000200 /* interrupt enable bit */ +#define PSL_D 0x00000400 /* string instruction direction bit */ +#define PSL_V 0x00000800 /* overflow bit */ +#define PSL_IOPL 0x00003000 /* i/o privilege level */ +#define PSL_NT 0x00004000 /* nested task bit */ +#define PSL_RF 0x00010000 /* resume flag bit */ +#define PSL_VM 0x00020000 /* virtual 8086 mode bit */ +#define PSL_AC 0x00040000 /* alignment checking */ +#define PSL_VIF 0x00080000 /* virtual interrupt enable */ +#define PSL_VIP 0x00100000 /* virtual interrupt pending */ +#define PSL_ID 0x00200000 /* identification bit */ + +/* + * The i486 manual says that we are not supposed to change reserved flags, + * but this is too much trouble since the reserved flags depend on the cpu + * and setting them to their historical values works in practice. + */ +#define PSL_RESERVED_DEFAULT 0x00000002 + +/* + * Initial flags for kernel and user mode. The kernel later inherits + * PSL_I and some other flags from user mode. + */ +#define PSL_KERNEL PSL_RESERVED_DEFAULT +#define PSL_USER (PSL_RESERVED_DEFAULT | PSL_I) + +/* + * Bits that can be changed in user mode on 486's. We allow these bits + * to be changed using ptrace(), sigreturn() and procfs. Setting PS_NT + * is undesirable but it may as well be allowed since users can inflict + * it on the kernel directly. Changes to PSL_AC are silently ignored on + * 386's. + * + * Users are allowed to change the privileged flag PSL_RF. The cpu sets PSL_RF + * in tf_eflags for faults. Debuggers should sometimes set it there too. + * tf_eflags is kept in the signal context during signal handling and there is + * no other place to remember it, so the PSL_RF bit may be corrupted by the + * signal handler without us knowing. Corruption of the PSL_RF bit at worst + * causes one more or one less debugger trap, so allowing it is fairly + * harmless. + */ +#define PSL_USERCHANGE (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_T \ + | PSL_D | PSL_V | PSL_NT | PSL_RF | PSL_AC | PSL_ID) + +#endif /* !_MACHINE_PSL_H_ */ diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h new file mode 100644 index 0000000000..bea3122423 --- /dev/null +++ b/usr/contrib/freebsd/x86/specialreg.h @@ -0,0 +1,839 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $ + */ + +#ifndef _MACHINE_SPECIALREG_H_ +#define _MACHINE_SPECIALREG_H_ + +/* + * Bits in 386 special registers: + */ +#define CR0_PE 0x00000001 /* Protected mode Enable */ +#define CR0_MP 0x00000002 /* "Math" (fpu) Present */ +#define CR0_EM 0x00000004 /* EMulate FPU instructions. (trap ESC only) */ +#define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */ +#define CR0_PG 0x80000000 /* PaGing enable */ + +/* + * Bits in 486 special registers: + */ +#define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */ +#define CR0_WP 0x00010000 /* Write Protect (honor page protect in + all modes) */ +#define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */ +#define CR0_NW 0x20000000 /* Not Write-through */ +#define CR0_CD 0x40000000 /* Cache Disable */ + +#define CR3_PCID_SAVE 0x8000000000000000 + +/* + * Bits in PPro special registers + */ +#define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */ +#define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */ +#define CR4_TSD 0x00000004 /* Time stamp disable */ +#define CR4_DE 0x00000008 /* Debugging extensions */ +#define CR4_PSE 0x00000010 /* Page size extensions */ +#define CR4_PAE 0x00000020 /* Physical address extension */ +#define CR4_MCE 0x00000040 /* Machine check enable */ +#define CR4_PGE 0x00000080 /* Page global enable */ +#define CR4_PCE 0x00000100 /* Performance monitoring counter enable */ +#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */ +#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */ +#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */ +#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */ +#define CR4_PCIDE 0x00020000 /* Enable Context ID */ +#define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ +#define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ + +/* + * Bits in AMD64 special registers. EFER is 64 bits wide. + */ +#define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */ +#define EFER_LME 0x000000100 /* Long mode enable (R/W) */ +#define EFER_LMA 0x000000400 /* Long mode active (R) */ +#define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ +#define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ + +/* + * Intel Extended Features registers + */ +#define XCR0 0 /* XFEATURE_ENABLED_MASK register */ + +#define XFEATURE_ENABLED_X87 0x00000001 +#define XFEATURE_ENABLED_SSE 0x00000002 +#define XFEATURE_ENABLED_YMM_HI128 0x00000004 +#define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128 +#define XFEATURE_ENABLED_BNDREGS 0x00000008 +#define XFEATURE_ENABLED_BNDCSR 0x00000010 +#define XFEATURE_ENABLED_OPMASK 0x00000020 +#define XFEATURE_ENABLED_ZMM_HI256 0x00000040 +#define XFEATURE_ENABLED_HI16_ZMM 0x00000080 + +#define XFEATURE_AVX \ + (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX) +#define XFEATURE_AVX512 \ + (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \ + XFEATURE_ENABLED_HI16_ZMM) +#define XFEATURE_MPX \ + (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR) + +/* + * CPUID instruction features register + */ +#define CPUID_FPU 0x00000001 +#define CPUID_VME 0x00000002 +#define CPUID_DE 0x00000004 +#define CPUID_PSE 0x00000008 +#define CPUID_TSC 0x00000010 +#define CPUID_MSR 0x00000020 +#define CPUID_PAE 0x00000040 +#define CPUID_MCE 0x00000080 +#define CPUID_CX8 0x00000100 +#define CPUID_APIC 0x00000200 +#define CPUID_B10 0x00000400 +#define CPUID_SEP 0x00000800 +#define CPUID_MTRR 0x00001000 +#define CPUID_PGE 0x00002000 +#define CPUID_MCA 0x00004000 +#define CPUID_CMOV 0x00008000 +#define CPUID_PAT 0x00010000 +#define CPUID_PSE36 0x00020000 +#define CPUID_PSN 0x00040000 +#define CPUID_CLFSH 0x00080000 +#define CPUID_B20 0x00100000 +#define CPUID_DS 0x00200000 +#define CPUID_ACPI 0x00400000 +#define CPUID_MMX 0x00800000 +#define CPUID_FXSR 0x01000000 +#define CPUID_SSE 0x02000000 +#define CPUID_XMM 0x02000000 +#define CPUID_SSE2 0x04000000 +#define CPUID_SS 0x08000000 +#define CPUID_HTT 0x10000000 +#define CPUID_TM 0x20000000 +#define CPUID_IA64 0x40000000 +#define CPUID_PBE 0x80000000 + +#define CPUID2_SSE3 0x00000001 +#define CPUID2_PCLMULQDQ 0x00000002 +#define CPUID2_DTES64 0x00000004 +#define CPUID2_MON 0x00000008 +#define CPUID2_DS_CPL 0x00000010 +#define CPUID2_VMX 0x00000020 +#define CPUID2_SMX 0x00000040 +#define CPUID2_EST 0x00000080 +#define CPUID2_TM2 0x00000100 +#define CPUID2_SSSE3 0x00000200 +#define CPUID2_CNXTID 0x00000400 +#define CPUID2_FMA 0x00001000 +#define CPUID2_CX16 0x00002000 +#define CPUID2_XTPR 0x00004000 +#define CPUID2_PDCM 0x00008000 +#define CPUID2_PCID 0x00020000 +#define CPUID2_DCA 0x00040000 +#define CPUID2_SSE41 0x00080000 +#define CPUID2_SSE42 0x00100000 +#define CPUID2_X2APIC 0x00200000 +#define CPUID2_MOVBE 0x00400000 +#define CPUID2_POPCNT 0x00800000 +#define CPUID2_TSCDLT 0x01000000 +#define CPUID2_AESNI 0x02000000 +#define CPUID2_XSAVE 0x04000000 +#define CPUID2_OSXSAVE 0x08000000 +#define CPUID2_AVX 0x10000000 +#define CPUID2_F16C 0x20000000 +#define CPUID2_RDRAND 0x40000000 +#define CPUID2_HV 0x80000000 + +/* + * Important bits in the Thermal and Power Management flags + * CPUID.6 EAX and ECX. + */ +#define CPUTPM1_SENSOR 0x00000001 +#define CPUTPM1_TURBO 0x00000002 +#define CPUTPM1_ARAT 0x00000004 +#define CPUTPM2_EFFREQ 0x00000001 + +/* + * Important bits in the AMD extended cpuid flags + */ +#define AMDID_SYSCALL 0x00000800 +#define AMDID_MP 0x00080000 +#define AMDID_NX 0x00100000 +#define AMDID_EXT_MMX 0x00400000 +#define AMDID_FFXSR 0x01000000 +#define AMDID_PAGE1GB 0x04000000 +#define AMDID_RDTSCP 0x08000000 +#define AMDID_LM 0x20000000 +#define AMDID_EXT_3DNOW 0x40000000 +#define AMDID_3DNOW 0x80000000 + +#define AMDID2_LAHF 0x00000001 +#define AMDID2_CMP 0x00000002 +#define AMDID2_SVM 0x00000004 +#define AMDID2_EXT_APIC 0x00000008 +#define AMDID2_CR8 0x00000010 +#define AMDID2_ABM 0x00000020 +#define AMDID2_SSE4A 0x00000040 +#define AMDID2_MAS 0x00000080 +#define AMDID2_PREFETCH 0x00000100 +#define AMDID2_OSVW 0x00000200 +#define AMDID2_IBS 0x00000400 +#define AMDID2_XOP 0x00000800 +#define AMDID2_SKINIT 0x00001000 +#define AMDID2_WDT 0x00002000 +#define AMDID2_LWP 0x00008000 +#define AMDID2_FMA4 0x00010000 +#define AMDID2_TCE 0x00020000 +#define AMDID2_NODE_ID 0x00080000 +#define AMDID2_TBM 0x00200000 +#define AMDID2_TOPOLOGY 0x00400000 +#define AMDID2_PCXC 0x00800000 +#define AMDID2_PNXC 0x01000000 +#define AMDID2_DBE 0x04000000 +#define AMDID2_PTSC 0x08000000 +#define AMDID2_PTSCEL2I 0x10000000 + +/* + * CPUID instruction 1 eax info + */ +#define CPUID_STEPPING 0x0000000f +#define CPUID_MODEL 0x000000f0 +#define CPUID_FAMILY 0x00000f00 +#define CPUID_EXT_MODEL 0x000f0000 +#define CPUID_EXT_FAMILY 0x0ff00000 +#ifdef __i386__ +#define CPUID_TO_MODEL(id) \ + ((((id) & CPUID_MODEL) >> 4) | \ + ((((id) & CPUID_FAMILY) >= 0x600) ? \ + (((id) & CPUID_EXT_MODEL) >> 12) : 0)) +#define CPUID_TO_FAMILY(id) \ + ((((id) & CPUID_FAMILY) >> 8) + \ + ((((id) & CPUID_FAMILY) == 0xf00) ? \ + (((id) & CPUID_EXT_FAMILY) >> 20) : 0)) +#else +#define CPUID_TO_MODEL(id) \ + ((((id) & CPUID_MODEL) >> 4) | \ + (((id) & CPUID_EXT_MODEL) >> 12)) +#define CPUID_TO_FAMILY(id) \ + ((((id) & CPUID_FAMILY) >> 8) + \ + (((id) & CPUID_EXT_FAMILY) >> 20)) +#endif + +/* + * CPUID instruction 1 ebx info + */ +#define CPUID_BRAND_INDEX 0x000000ff +#define CPUID_CLFUSH_SIZE 0x0000ff00 +#define CPUID_HTT_CORES 0x00ff0000 +#define CPUID_LOCAL_APIC_ID 0xff000000 + +/* + * CPUID instruction 5 info + */ +#define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */ +#define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */ +#define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */ +#define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */ + +/* + * MWAIT cpu power states. Lower 4 bits are sub-states. + */ +#define MWAIT_C0 0xf0 +#define MWAIT_C1 0x00 +#define MWAIT_C2 0x10 +#define MWAIT_C3 0x20 +#define MWAIT_C4 0x30 + +/* + * MWAIT extensions. + */ +/* Interrupt breaks MWAIT even when masked. */ +#define MWAIT_INTRBREAK 0x00000001 + +/* + * CPUID instruction 6 ecx info + */ +#define CPUID_PERF_STAT 0x00000001 +#define CPUID_PERF_BIAS 0x00000008 + +/* + * CPUID instruction 0xb ebx info. + */ +#define CPUID_TYPE_INVAL 0 +#define CPUID_TYPE_SMT 1 +#define CPUID_TYPE_CORE 2 + +/* + * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1 + */ +#define CPUID_EXTSTATE_XSAVEOPT 0x00000001 +#define CPUID_EXTSTATE_XSAVEC 0x00000002 +#define CPUID_EXTSTATE_XINUSE 0x00000004 +#define CPUID_EXTSTATE_XSAVES 0x00000008 + +/* + * AMD extended function 8000_0007h edx info + */ +#define AMDPM_TS 0x00000001 +#define AMDPM_FID 0x00000002 +#define AMDPM_VID 0x00000004 +#define AMDPM_TTP 0x00000008 +#define AMDPM_TM 0x00000010 +#define AMDPM_STC 0x00000020 +#define AMDPM_100MHZ_STEPS 0x00000040 +#define AMDPM_HW_PSTATE 0x00000080 +#define AMDPM_TSC_INVARIANT 0x00000100 +#define AMDPM_CPB 0x00000200 + +/* + * AMD extended function 8000_0008h ecx info + */ +#define AMDID_CMP_CORES 0x000000ff +#define AMDID_COREID_SIZE 0x0000f000 +#define AMDID_COREID_SIZE_SHIFT 12 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info + */ +#define CPUID_STDEXT_FSGSBASE 0x00000001 +#define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_BMI1 0x00000008 +#define CPUID_STDEXT_HLE 0x00000010 +#define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_SMEP 0x00000080 +#define CPUID_STDEXT_BMI2 0x00000100 +#define CPUID_STDEXT_ERMS 0x00000200 +#define CPUID_STDEXT_INVPCID 0x00000400 +#define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_RDSEED 0x00040000 +#define CPUID_STDEXT_ADX 0x00080000 +#define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_PROCTRACE 0x02000000 +#define CPUID_STDEXT_AVX512PF 0x04000000 +#define CPUID_STDEXT_AVX512ER 0x08000000 +#define CPUID_STDEXT_AVX512CD 0x10000000 +#define CPUID_STDEXT_SHA 0x20000000 + +/* + * CPUID manufacturers identifiers + */ +#define AMD_VENDOR_ID "AuthenticAMD" +#define CENTAUR_VENDOR_ID "CentaurHauls" +#define CYRIX_VENDOR_ID "CyrixInstead" +#define INTEL_VENDOR_ID "GenuineIntel" +#define NEXGEN_VENDOR_ID "NexGenDriven" +#define NSC_VENDOR_ID "Geode by NSC" +#define RISE_VENDOR_ID "RiseRiseRise" +#define SIS_VENDOR_ID "SiS SiS SiS " +#define TRANSMETA_VENDOR_ID "GenuineTMx86" +#define UMC_VENDOR_ID "UMC UMC UMC " + +/* + * Model-specific registers for the i386 family + */ +#define MSR_P5_MC_ADDR 0x000 +#define MSR_P5_MC_TYPE 0x001 +#define MSR_TSC 0x010 +#define MSR_P5_CESR 0x011 +#define MSR_P5_CTR0 0x012 +#define MSR_P5_CTR1 0x013 +#define MSR_IA32_PLATFORM_ID 0x017 +#define MSR_APICBASE 0x01b +#define MSR_EBL_CR_POWERON 0x02a +#define MSR_TEST_CTL 0x033 +#define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_BIOS_UPDT_TRIG 0x079 +#define MSR_BBL_CR_D0 0x088 +#define MSR_BBL_CR_D1 0x089 +#define MSR_BBL_CR_D2 0x08a +#define MSR_BIOS_SIGN 0x08b +#define MSR_PERFCTR0 0x0c1 +#define MSR_PERFCTR1 0x0c2 +#define MSR_PLATFORM_INFO 0x0ce +#define MSR_MPERF 0x0e7 +#define MSR_APERF 0x0e8 +#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ +#define MSR_MTRRcap 0x0fe +#define MSR_BBL_CR_ADDR 0x116 +#define MSR_BBL_CR_DECC 0x118 +#define MSR_BBL_CR_CTL 0x119 +#define MSR_BBL_CR_TRIG 0x11a +#define MSR_BBL_CR_BUSY 0x11b +#define MSR_BBL_CR_CTL3 0x11e +#define MSR_SYSENTER_CS_MSR 0x174 +#define MSR_SYSENTER_ESP_MSR 0x175 +#define MSR_SYSENTER_EIP_MSR 0x176 +#define MSR_MCG_CAP 0x179 +#define MSR_MCG_STATUS 0x17a +#define MSR_MCG_CTL 0x17b +#define MSR_EVNTSEL0 0x186 +#define MSR_EVNTSEL1 0x187 +#define MSR_THERM_CONTROL 0x19a +#define MSR_THERM_INTERRUPT 0x19b +#define MSR_THERM_STATUS 0x19c +#define MSR_IA32_MISC_ENABLE 0x1a0 +#define MSR_IA32_TEMPERATURE_TARGET 0x1a2 +#define MSR_TURBO_RATIO_LIMIT 0x1ad +#define MSR_TURBO_RATIO_LIMIT1 0x1ae +#define MSR_DEBUGCTLMSR 0x1d9 +#define MSR_LASTBRANCHFROMIP 0x1db +#define MSR_LASTBRANCHTOIP 0x1dc +#define MSR_LASTINTFROMIP 0x1dd +#define MSR_LASTINTTOIP 0x1de +#define MSR_ROB_CR_BKUPTMPDR6 0x1e0 +#define MSR_MTRRVarBase 0x200 +#define MSR_MTRR64kBase 0x250 +#define MSR_MTRR16kBase 0x258 +#define MSR_MTRR4kBase 0x268 +#define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 +#define MSR_MTRRdefType 0x2ff +#define MSR_MC0_CTL 0x400 +#define MSR_MC0_STATUS 0x401 +#define MSR_MC0_ADDR 0x402 +#define MSR_MC0_MISC 0x403 +#define MSR_MC1_CTL 0x404 +#define MSR_MC1_STATUS 0x405 +#define MSR_MC1_ADDR 0x406 +#define MSR_MC1_MISC 0x407 +#define MSR_MC2_CTL 0x408 +#define MSR_MC2_STATUS 0x409 +#define MSR_MC2_ADDR 0x40a +#define MSR_MC2_MISC 0x40b +#define MSR_MC3_CTL 0x40c +#define MSR_MC3_STATUS 0x40d +#define MSR_MC3_ADDR 0x40e +#define MSR_MC3_MISC 0x40f +#define MSR_MC4_CTL 0x410 +#define MSR_MC4_STATUS 0x411 +#define MSR_MC4_ADDR 0x412 +#define MSR_MC4_MISC 0x413 +#define MSR_RAPL_POWER_UNIT 0x606 +#define MSR_PKG_ENERGY_STATUS 0x611 +#define MSR_DRAM_ENERGY_STATUS 0x619 +#define MSR_PP0_ENERGY_STATUS 0x639 +#define MSR_PP1_ENERGY_STATUS 0x641 + +/* + * VMX MSRs + */ +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 +#define MSR_VMX_PROCBASED_CTLS2 0x48b +#define MSR_VMX_EPT_VPID_CAP 0x48c +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +/* + * X2APIC MSRs + */ +#define MSR_APIC_ID 0x802 +#define MSR_APIC_VERSION 0x803 +#define MSR_APIC_TPR 0x808 +#define MSR_APIC_EOI 0x80b +#define MSR_APIC_LDR 0x80d +#define MSR_APIC_SVR 0x80f +#define MSR_APIC_ISR0 0x810 +#define MSR_APIC_ISR1 0x811 +#define MSR_APIC_ISR2 0x812 +#define MSR_APIC_ISR3 0x813 +#define MSR_APIC_ISR4 0x814 +#define MSR_APIC_ISR5 0x815 +#define MSR_APIC_ISR6 0x816 +#define MSR_APIC_ISR7 0x817 +#define MSR_APIC_TMR0 0x818 +#define MSR_APIC_IRR0 0x820 +#define MSR_APIC_ESR 0x828 +#define MSR_APIC_LVT_CMCI 0x82F +#define MSR_APIC_ICR 0x830 +#define MSR_APIC_LVT_TIMER 0x832 +#define MSR_APIC_LVT_THERMAL 0x833 +#define MSR_APIC_LVT_PCINT 0x834 +#define MSR_APIC_LVT_LINT0 0x835 +#define MSR_APIC_LVT_LINT1 0x836 +#define MSR_APIC_LVT_ERROR 0x837 +#define MSR_APIC_ICR_TIMER 0x838 +#define MSR_APIC_CCR_TIMER 0x839 +#define MSR_APIC_DCR_TIMER 0x83e +#define MSR_APIC_SELF_IPI 0x83f + +#define MSR_IA32_XSS 0xda0 + +/* + * Constants related to MSR's. + */ +#define APICBASE_RESERVED 0x000002ff +#define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 +#define APICBASE_ENABLED 0x00000800 +#define APICBASE_ADDRESS 0xfffff000 + +/* MSR_IA32_FEATURE_CONTROL related */ +#define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */ +#define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ +#define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ + +/* + * PAT modes. + */ +#define PAT_UNCACHEABLE 0x00 +#define PAT_WRITE_COMBINING 0x01 +#define PAT_WRITE_THROUGH 0x04 +#define PAT_WRITE_PROTECTED 0x05 +#define PAT_WRITE_BACK 0x06 +#define PAT_UNCACHED 0x07 +#define PAT_VALUE(i, m) ((long long)(m) << (8 * (i))) +#define PAT_MASK(i) PAT_VALUE(i, 0xff) + +/* + * Constants related to MTRRs + */ +#define MTRR_UNCACHEABLE 0x00 +#define MTRR_WRITE_COMBINING 0x01 +#define MTRR_WRITE_THROUGH 0x04 +#define MTRR_WRITE_PROTECTED 0x05 +#define MTRR_WRITE_BACK 0x06 +#define MTRR_N64K 8 /* numbers of fixed-size entries */ +#define MTRR_N16K 16 +#define MTRR_N4K 64 +#define MTRR_CAP_WC 0x0000000000000400 +#define MTRR_CAP_FIXED 0x0000000000000100 +#define MTRR_CAP_VCNT 0x00000000000000ff +#define MTRR_DEF_ENABLE 0x0000000000000800 +#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 +#define MTRR_DEF_TYPE 0x00000000000000ff +#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 +#define MTRR_PHYSBASE_TYPE 0x00000000000000ff +#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 +#define MTRR_PHYSMASK_VALID 0x0000000000000800 + +/* + * Cyrix configuration registers, accessible as IO ports. + */ +#define CCR0 0xc0 /* Configuration control register 0 */ +#define CCR0_NC0 0x01 /* First 64K of each 1M memory region is + non-cacheable */ +#define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */ +#define CCR0_A20M 0x04 /* Enables A20M# input pin */ +#define CCR0_KEN 0x08 /* Enables KEN# input pin */ +#define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */ +#define CCR0_BARB 0x20 /* Flushes internal cache when entering hold + state */ +#define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set + assoc */ +#define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */ + +#define CCR1 0xc1 /* Configuration control register 1 */ +#define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */ +#define CCR1_SMI 0x02 /* Enables SMM pins */ +#define CCR1_SMAC 0x04 /* System management memory access */ +#define CCR1_MMAC 0x08 /* Main memory access */ +#define CCR1_NO_LOCK 0x10 /* Negate LOCK# */ +#define CCR1_SM3 0x80 /* SMM address space address region 3 */ + +#define CCR2 0xc2 +#define CCR2_WB 0x02 /* Enables WB cache interface pins */ +#define CCR2_SADS 0x02 /* Slow ADS */ +#define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */ +#define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */ +#define CCR2_WT1 0x10 /* WT region 1 */ +#define CCR2_WPR1 0x10 /* Write-protect region 1 */ +#define CCR2_BARB 0x20 /* Flushes write-back cache when entering + hold state. */ +#define CCR2_BWRT 0x40 /* Enables burst write cycles */ +#define CCR2_USE_SUSP 0x80 /* Enables suspend pins */ + +#define CCR3 0xc3 +#define CCR3_SMILOCK 0x01 /* SMM register lock */ +#define CCR3_NMI 0x02 /* Enables NMI during SMM */ +#define CCR3_LINBRST 0x04 /* Linear address burst cycles */ +#define CCR3_SMMMODE 0x08 /* SMM Mode */ +#define CCR3_MAPEN0 0x10 /* Enables Map0 */ +#define CCR3_MAPEN1 0x20 /* Enables Map1 */ +#define CCR3_MAPEN2 0x40 /* Enables Map2 */ +#define CCR3_MAPEN3 0x80 /* Enables Map3 */ + +#define CCR4 0xe8 +#define CCR4_IOMASK 0x07 +#define CCR4_MEM 0x08 /* Enables momory bypassing */ +#define CCR4_DTE 0x10 /* Enables directory table entry cache */ +#define CCR4_FASTFPE 0x20 /* Fast FPU exception */ +#define CCR4_CPUID 0x80 /* Enables CPUID instruction */ + +#define CCR5 0xe9 +#define CCR5_WT_ALLOC 0x01 /* Write-through allocate */ +#define CCR5_SLOP 0x02 /* LOOP instruction slowed down */ +#define CCR5_LBR1 0x10 /* Local bus region 1 */ +#define CCR5_ARREN 0x20 /* Enables ARR region */ + +#define CCR6 0xea + +#define CCR7 0xeb + +/* Performance Control Register (5x86 only). */ +#define PCR0 0x20 +#define PCR0_RSTK 0x01 /* Enables return stack */ +#define PCR0_BTB 0x02 /* Enables branch target buffer */ +#define PCR0_LOOP 0x04 /* Enables loop */ +#define PCR0_AIS 0x08 /* Enables all instrcutions stalled to + serialize pipe. */ +#define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */ +#define PCR0_BTBRT 0x40 /* Enables BTB test register. */ +#define PCR0_LSSER 0x80 /* Disable reorder */ + +/* Device Identification Registers */ +#define DIR0 0xfe +#define DIR1 0xff + +/* + * Machine Check register constants. + */ +#define MCG_CAP_COUNT 0x000000ff +#define MCG_CAP_CTL_P 0x00000100 +#define MCG_CAP_EXT_P 0x00000200 +#define MCG_CAP_CMCI_P 0x00000400 +#define MCG_CAP_TES_P 0x00000800 +#define MCG_CAP_EXT_CNT 0x00ff0000 +#define MCG_CAP_SER_P 0x01000000 +#define MCG_STATUS_RIPV 0x00000001 +#define MCG_STATUS_EIPV 0x00000002 +#define MCG_STATUS_MCIP 0x00000004 +#define MCG_CTL_ENABLE 0xffffffffffffffff +#define MCG_CTL_DISABLE 0x0000000000000000 +#define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4) +#define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) +#define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) +#define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) +#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_MCA_ERROR 0x000000000000ffff +#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 +#define MC_STATUS_OTHER_INFO 0x01ffffff00000000 +#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_PCC 0x0200000000000000 +#define MC_STATUS_ADDRV 0x0400000000000000 +#define MC_STATUS_MISCV 0x0800000000000000 +#define MC_STATUS_EN 0x1000000000000000 +#define MC_STATUS_UC 0x2000000000000000 +#define MC_STATUS_OVER 0x4000000000000000 +#define MC_STATUS_VAL 0x8000000000000000 +#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ +#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ +#define MC_CTL2_THRESHOLD 0x0000000000007fff +#define MC_CTL2_CMCI_EN 0x0000000040000000 + +/* + * The following four 3-byte registers control the non-cacheable regions. + * These registers must be written as three separate bytes. + * + * NCRx+0: A31-A24 of starting address + * NCRx+1: A23-A16 of starting address + * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx. + * + * The non-cacheable region's starting address must be aligned to the + * size indicated by the NCR_SIZE_xx field. + */ +#define NCR1 0xc4 +#define NCR2 0xc7 +#define NCR3 0xca +#define NCR4 0xcd + +#define NCR_SIZE_0K 0 +#define NCR_SIZE_4K 1 +#define NCR_SIZE_8K 2 +#define NCR_SIZE_16K 3 +#define NCR_SIZE_32K 4 +#define NCR_SIZE_64K 5 +#define NCR_SIZE_128K 6 +#define NCR_SIZE_256K 7 +#define NCR_SIZE_512K 8 +#define NCR_SIZE_1M 9 +#define NCR_SIZE_2M 10 +#define NCR_SIZE_4M 11 +#define NCR_SIZE_8M 12 +#define NCR_SIZE_16M 13 +#define NCR_SIZE_32M 14 +#define NCR_SIZE_4G 15 + +/* + * The address region registers are used to specify the location and + * size for the eight address regions. + * + * ARRx + 0: A31-A24 of start address + * ARRx + 1: A23-A16 of start address + * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx + */ +#define ARR0 0xc4 +#define ARR1 0xc7 +#define ARR2 0xca +#define ARR3 0xcd +#define ARR4 0xd0 +#define ARR5 0xd3 +#define ARR6 0xd6 +#define ARR7 0xd9 + +#define ARR_SIZE_0K 0 +#define ARR_SIZE_4K 1 +#define ARR_SIZE_8K 2 +#define ARR_SIZE_16K 3 +#define ARR_SIZE_32K 4 +#define ARR_SIZE_64K 5 +#define ARR_SIZE_128K 6 +#define ARR_SIZE_256K 7 +#define ARR_SIZE_512K 8 +#define ARR_SIZE_1M 9 +#define ARR_SIZE_2M 10 +#define ARR_SIZE_4M 11 +#define ARR_SIZE_8M 12 +#define ARR_SIZE_16M 13 +#define ARR_SIZE_32M 14 +#define ARR_SIZE_4G 15 + +/* + * The region control registers specify the attributes associated with + * the ARRx addres regions. + */ +#define RCR0 0xdc +#define RCR1 0xdd +#define RCR2 0xde +#define RCR3 0xdf +#define RCR4 0xe0 +#define RCR5 0xe1 +#define RCR6 0xe2 +#define RCR7 0xe3 + +#define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */ +#define RCR_RCE 0x01 /* Enables caching for ARR7. */ +#define RCR_WWO 0x02 /* Weak write ordering. */ +#define RCR_WL 0x04 /* Weak locking. */ +#define RCR_WG 0x08 /* Write gathering. */ +#define RCR_WT 0x10 /* Write-through. */ +#define RCR_NLB 0x20 /* LBA# pin is not asserted. */ + +/* AMD Write Allocate Top-Of-Memory and Control Register */ +#define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */ +#define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */ +#define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */ + +/* AMD64 MSR's */ +#define MSR_EFER 0xc0000080 /* extended features */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */ +#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ +#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ +#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ +#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_PERFEVSEL0 0xc0010000 +#define MSR_PERFEVSEL1 0xc0010001 +#define MSR_PERFEVSEL2 0xc0010002 +#define MSR_PERFEVSEL3 0xc0010003 +#define MSR_K7_PERFCTR0 0xc0010004 +#define MSR_K7_PERFCTR1 0xc0010005 +#define MSR_K7_PERFCTR2 0xc0010006 +#define MSR_K7_PERFCTR3 0xc0010007 +#define MSR_SYSCFG 0xc0010010 +#define MSR_HWCR 0xc0010015 +#define MSR_IORRBASE0 0xc0010016 +#define MSR_IORRMASK0 0xc0010017 +#define MSR_IORRBASE1 0xc0010018 +#define MSR_IORRMASK1 0xc0010019 +#define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ +#define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ +#define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ +#define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ +#define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ +#define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ +#define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ +#define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ + +/* MSR_VM_CR related */ +#define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ + +/* VIA ACE crypto featureset: for via_feature_rng */ +#define VIA_HAS_RNG 1 /* cpu has RNG */ + +/* VIA ACE crypto featureset: for via_feature_xcrypt */ +#define VIA_HAS_AES 1 /* cpu has AES */ +#define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */ +#define VIA_HAS_MM 4 /* cpu has RSA instructions */ +#define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */ + +/* Centaur Extended Feature flags */ +#define VIA_CPUID_HAS_RNG 0x000004 +#define VIA_CPUID_DO_RNG 0x000008 +#define VIA_CPUID_HAS_ACE 0x000040 +#define VIA_CPUID_DO_ACE 0x000080 +#define VIA_CPUID_HAS_ACE2 0x000100 +#define VIA_CPUID_DO_ACE2 0x000200 +#define VIA_CPUID_HAS_PHE 0x000400 +#define VIA_CPUID_DO_PHE 0x000800 +#define VIA_CPUID_HAS_PMM 0x001000 +#define VIA_CPUID_DO_PMM 0x002000 + +/* VIA ACE xcrypt-* instruction context control options */ +#define VIA_CRYPT_CWLO_ROUND_M 0x0000000f +#define VIA_CRYPT_CWLO_ALG_M 0x00000070 +#define VIA_CRYPT_CWLO_ALG_AES 0x00000000 +#define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080 +#define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000 +#define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080 +#define VIA_CRYPT_CWLO_NORMAL 0x00000000 +#define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100 +#define VIA_CRYPT_CWLO_ENCRYPT 0x00000000 +#define VIA_CRYPT_CWLO_DECRYPT 0x00000200 +#define VIA_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */ +#define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */ +#define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */ + +#endif /* !_MACHINE_SPECIALREG_H_ */ diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile new file mode 100644 index 0000000000..f47daead31 --- /dev/null +++ b/usr/src/cmd/bhyve/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Pluribus Networks Inc. +# + +PROG = bhyve + +include ../Makefile.cmd + +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com new file mode 100644 index 0000000000..4a92b622ab --- /dev/null +++ b/usr/src/cmd/bhyve/Makefile.com @@ -0,0 +1,94 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Pluribus Networks Inc. +# + +PROG= bhyve + +SRCS = atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + console.c \ + consport.c \ + inout.c \ + ioapic.c \ + mem.c \ + mptbl.c \ + pci_ahci.c \ + pci_emul.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_virtio_block.c \ + pci_virtio_net.c \ + pci_virtio_viona.c \ + pm.c \ + pmtmr.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + uart_emul.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + bhyve_sol_glue.c + +OBJS = $(SRCS:.c=.o) + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common +LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi + +POST_PROCESS += ; $(GENSETDEFS) $@ + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: ../%.s + $(COMPILE.s) $< diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h new file mode 100644 index 0000000000..477f827286 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef _ACPI_H_ +#define _ACPI_H_ + +#define SCI_INT 9 + +#define SMI_CMD 0xb2 +#define BHYVE_ACPI_ENABLE 0xa0 +#define BHYVE_ACPI_DISABLE 0xa1 + +#define PM1A_EVT_ADDR 0x400 +#define PM1A_CNT_ADDR 0x404 + +#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ + +struct vmctx; + +int acpi_build(struct vmctx *ctx, int ncpu); +void dsdt_line(const char *fmt, ...); +void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); +void dsdt_fixed_irq(uint8_t irq); +void dsdt_fixed_mem32(uint32_t base, uint32_t length); +void dsdt_indent(int levels); +void dsdt_unindent(int levels); +void sci_init(struct vmctx *ctx); + +#endif /* _ACPI_H_ */ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h new file mode 100644 index 0000000000..1cf09adcbf --- /dev/null +++ b/usr/src/cmd/bhyve/ahci.h @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org> + * Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $ + */ + +#ifndef _AHCI_H_ +#define _AHCI_H_ + +/* ATA register defines */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ + +/* SATA register defines */ +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000040 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000040 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 + +/* SATA AHCI v1.0 register defines */ +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 + +/* Just to be sure, if building as module. */ +#if MAXPHYS < 512 * 1024 +#undef MAXPHYS +#define MAXPHYS 512 * 1024 +#endif +/* Pessimistic prognosis on number of required S/G entries */ +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +/* Command list. 32 commands. First, 1Kbyte aligned. */ +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 +/* Command tables. Up to 32 commands, Each, 128byte aligned. */ +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +/* Total main work area. */ +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) + +#endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/amd64/Makefile new file mode 100644 index 0000000000..13cdae6663 --- /dev/null +++ b/usr/src/cmd/bhyve/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c new file mode 100644 index 0000000000..4d09d88266 --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -0,0 +1,576 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $"); + +#include <sys/types.h> + +#include <machine/vmm.h> + +#include <vmmapi.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "ps2kbd.h" +#include "ps2mouse.h" + +#define KBD_DATA_PORT 0x60 + +#define KBD_STS_CTL_PORT 0x64 + +#define KBDC_RESET 0xfe + +#define KBD_DEV_IRQ 1 +#define AUX_DEV_IRQ 12 + +/* controller commands */ +#define KBDC_SET_COMMAND_BYTE 0x60 +#define KBDC_GET_COMMAND_BYTE 0x20 +#define KBDC_DISABLE_AUX_PORT 0xa7 +#define KBDC_ENABLE_AUX_PORT 0xa8 +#define KBDC_TEST_AUX_PORT 0xa9 +#define KBDC_TEST_CTRL 0xaa +#define KBDC_TEST_KBD_PORT 0xab +#define KBDC_DISABLE_KBD_PORT 0xad +#define KBDC_ENABLE_KBD_PORT 0xae +#define KBDC_READ_INPORT 0xc0 +#define KBDC_READ_OUTPORT 0xd0 +#define KBDC_WRITE_OUTPORT 0xd1 +#define KBDC_WRITE_KBD_OUTBUF 0xd2 +#define KBDC_WRITE_AUX_OUTBUF 0xd3 +#define KBDC_WRITE_TO_AUX 0xd4 + +/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */ +#define KBD_TRANSLATION 0x40 +#define KBD_SYS_FLAG_BIT 0x04 +#define KBD_DISABLE_KBD_PORT 0x10 +#define KBD_DISABLE_AUX_PORT 0x20 +#define KBD_ENABLE_AUX_INT 0x02 +#define KBD_ENABLE_KBD_INT 0x01 +#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT) +#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT) + +/* controller status bits */ +#define KBDS_KBD_BUFFER_FULL 0x01 +#define KBDS_SYS_FLAG 0x04 +#define KBDS_CTRL_FLAG 0x08 +#define KBDS_AUX_BUFFER_FULL 0x20 + +/* controller output port */ +#define KBDO_KBD_OUTFULL 0x10 +#define KBDO_AUX_OUTFULL 0x20 + +#define RAMSZ 32 + +struct kbd_dev { + bool irq_active; + int irq; + + uint8_t buffer; +}; + +struct aux_dev { + bool irq_active; + int irq; + + uint8_t buffer; +}; + +struct atkbdc_softc { + struct vmctx *ctx; + pthread_mutex_t mtx; + + struct ps2kbd_softc *ps2kbd_sc; + struct ps2mouse_softc *ps2mouse_sc; + + uint8_t status; /* status register */ + uint8_t outport; /* controller output port */ + uint8_t ram[RAMSZ]; /* byte0 = controller config */ + + uint32_t curcmd; /* current command for next byte */ + + struct kbd_dev kbd; + struct aux_dev aux; +}; + +static void +atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) +{ + if (!sc->kbd.irq_active && + (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + sc->kbd.irq_active = true; + vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); + } +} + +static void +atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc) +{ + if (sc->kbd.irq_active) { + vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); + sc->kbd.irq_active = false; + } +} + +static void +atkbdc_assert_aux_intr(struct atkbdc_softc *sc) +{ + if (!sc->aux.irq_active && + (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + sc->aux.irq_active = true; + vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + } +} + +static void +atkbdc_deassert_aux_intr(struct atkbdc_softc *sc) +{ + if (sc->aux.irq_active) { + vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + sc->aux.irq_active = false; + } +} + +static void +atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->aux.buffer = val; + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); +} + +static void +atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->kbd.buffer = val; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + atkbdc_assert_kbd_intr(sc); +} + +static void +atkbdc_aux_read(struct atkbdc_softc *sc) +{ + uint8_t val; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1) + atkbdc_aux_queue_data(sc, val); +} + +static void +atkbdc_kbd_read(struct atkbdc_softc *sc) +{ + const uint8_t translation[256] = { + 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58, + 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59, + 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a, + 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b, + 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c, + 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d, + 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e, + 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f, + 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60, + 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61, + 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e, + 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76, + 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b, + 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f, + 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45, + 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54, + 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff + }; + uint8_t val; + uint8_t release = 0; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (sc->ram[0] & KBD_TRANSLATION) { + while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) { + if (val == 0xf0) { + release = 0x80; + continue; + } else { + val = translation[val] | release; + } + + atkbdc_kbd_queue_data(sc, val); + break; + } + } else { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + } +} + +static void +atkbdc_aux_poll(struct atkbdc_softc *sc) +{ + if ((sc->outport & KBDO_AUX_OUTFULL) == 0) + atkbdc_aux_read(sc); +} + +static void +atkbdc_kbd_poll(struct atkbdc_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if ((sc->outport & KBDO_KBD_OUTFULL) == 0) + atkbdc_kbd_read(sc); +} + +static void +atkbdc_poll(struct atkbdc_softc *sc) +{ + atkbdc_aux_poll(sc); + atkbdc_kbd_poll(sc); +} + +static void +atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (sc->outport & KBDO_AUX_OUTFULL) { + *buf = sc->aux.buffer; + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport &= ~KBDO_AUX_OUTFULL; + atkbdc_deassert_aux_intr(sc); + + atkbdc_poll(sc); + return; + } + + *buf = sc->kbd.buffer; + sc->status &= ~KBDS_KBD_BUFFER_FULL; + sc->outport &= ~KBDO_KBD_OUTFULL; + atkbdc_deassert_kbd_intr(sc); + + atkbdc_poll(sc); +} + +static int +atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct atkbdc_softc *sc; + uint8_t buf; + int retval; + + if (bytes != 1) + return (-1); + + sc = arg; + retval = 0; + + pthread_mutex_lock(&sc->mtx); + if (in) { + sc->curcmd = 0; + sc->status &= ~KBDS_CTRL_FLAG; + + /* read device buffer; includes kbd cmd responses */ + atkbdc_dequeue_data(sc, &buf); + *eax = buf; + + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + if (sc->status & KBDS_CTRL_FLAG) { + /* + * Command byte for the controller. + */ + switch (sc->curcmd) { + case KBDC_SET_COMMAND_BYTE: + sc->ram[0] = *eax; + if (sc->ram[0] & KBD_SYS_FLAG_BIT) + sc->status |= KBDS_SYS_FLAG; + else + sc->status &= KBDS_SYS_FLAG; + if (sc->outport & KBDO_AUX_OUTFULL) + atkbdc_assert_aux_intr(sc); + else if (sc->outport & KBDO_KBD_OUTFULL) + atkbdc_assert_kbd_intr(sc); + break; + case KBDC_WRITE_OUTPORT: + sc->outport = *eax; + if (sc->outport & KBDO_AUX_OUTFULL) + sc->status |= (KBDS_AUX_BUFFER_FULL | + KBDS_KBD_BUFFER_FULL); + if (sc->outport & KBDO_KBD_OUTFULL) + sc->status |= KBDS_KBD_BUFFER_FULL; + break; + case KBDC_WRITE_TO_AUX: + ps2mouse_write(sc->ps2mouse_sc, *eax); + atkbdc_poll(sc); + break; + case KBDC_WRITE_KBD_OUTBUF: + atkbdc_kbd_queue_data(sc, *eax); + break; + case KBDC_WRITE_AUX_OUTBUF: + atkbdc_aux_queue_data(sc, *eax); + break; + default: + /* write to particular RAM byte */ + if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) { + int byten; + + byten = (sc->curcmd - 0x60) & 0x1f; + sc->ram[byten] = *eax & 0xff; + } + break; + } + + sc->curcmd = 0; + sc->status &= ~KBDS_CTRL_FLAG; + + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + /* + * Data byte for the device. + */ + ps2kbd_write(sc->ps2kbd_sc, *eax); + atkbdc_poll(sc); + + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + + +static int +atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + struct atkbdc_softc *sc; + int error, retval; + + if (bytes != 1) + return (-1); + + sc = arg; + retval = 0; + + pthread_mutex_lock(&sc->mtx); + + if (in) { + /* read status register */ + *eax = sc->status; + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + sc->curcmd = 0; + sc->status |= KBDS_CTRL_FLAG; + + switch (*eax) { + case KBDC_GET_COMMAND_BYTE: + atkbdc_kbd_queue_data(sc, sc->ram[0]); + break; + case KBDC_TEST_CTRL: + atkbdc_kbd_queue_data(sc, 0x55); + break; + case KBDC_TEST_AUX_PORT: + case KBDC_TEST_KBD_PORT: + atkbdc_kbd_queue_data(sc, 0); + break; + case KBDC_READ_INPORT: + atkbdc_kbd_queue_data(sc, 0); + break; + case KBDC_READ_OUTPORT: + atkbdc_kbd_queue_data(sc, sc->outport); + break; + case KBDC_SET_COMMAND_BYTE: + case KBDC_WRITE_OUTPORT: + case KBDC_WRITE_KBD_OUTBUF: + case KBDC_WRITE_AUX_OUTBUF: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_KBD_PORT: + sc->ram[0] |= KBD_DISABLE_KBD_PORT; + break; + case KBDC_ENABLE_KBD_PORT: + sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + atkbdc_poll(sc); + break; + case KBDC_WRITE_TO_AUX: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_AUX_PORT: + sc->ram[0] |= KBD_DISABLE_AUX_PORT; + break; + case KBDC_ENABLE_AUX_PORT: + sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + break; + case KBDC_RESET: /* Pulse "reset" line */ +#ifdef __FreeBSD__ + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); +#else + exit(0); +#endif + break; + default: + if (*eax >= 0x21 && *eax <= 0x3f) { + /* read "byte N" from RAM */ + int byten; + + byten = (*eax - 0x20) & 0x1f; + atkbdc_kbd_queue_data(sc, sc->ram[byten]); + } + break; + } + + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +atkbdc_event(struct atkbdc_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + atkbdc_poll(sc); + pthread_mutex_unlock(&sc->mtx); +} + +void +atkbdc_init(struct vmctx *ctx) +{ + struct inout_port iop; + struct atkbdc_softc *sc; + int error; + + sc = calloc(1, sizeof(struct atkbdc_softc)); + sc->ctx = ctx; + + pthread_mutex_init(&sc->mtx, NULL); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_STS_CTL_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_sts_ctl_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_DATA_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_data_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + pci_irq_reserve(KBD_DEV_IRQ); + sc->kbd.irq = KBD_DEV_IRQ; + + pci_irq_reserve(AUX_DEV_IRQ); + sc->aux.irq = AUX_DEV_IRQ; + + sc->ps2kbd_sc = ps2kbd_init(sc); + sc->ps2mouse_sc = ps2mouse_init(sc); +} + +#ifdef __FreeBSD__ +static void +atkbdc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (KBD)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(1); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (MOU)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(12); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(atkbdc_dsdt); +#endif diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h new file mode 100644 index 0000000000..48b3a8b00c --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ATKBDC_H_ +#define _ATKBDC_H_ + +struct atkbdc_softc; +struct vmctx; + +void atkbdc_init(struct vmctx *ctx); +void atkbdc_event(struct atkbdc_softc *sc); + +#endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c new file mode 100644 index 0000000000..633faacc5f --- /dev/null +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/uio.h> + +#include <termios.h> +#include <unistd.h> + +/* + * Make a pre-existing termios structure into "raw" mode: character-at-a-time + * mode with no characters interpreted, 8-bit data path. + */ +void +cfmakeraw(struct termios *t) +{ + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR); + t->c_iflag |= IGNBRK; + t->c_oflag &= ~OPOST; + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN); + t->c_cflag &= ~(CSIZE|PARENB); + t->c_cflag |= CS8|CREAD; + t->c_cc[VMIN] = 1; + t->c_cc[VTIME] = 0; +} + +ssize_t +preadv(int d, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t old_offset; + ssize_t n; + + old_offset = lseek(d, (off_t)0, SEEK_CUR); + if (old_offset == -1) + return (-1); + + offset = lseek(d, offset, SEEK_SET); + if (offset == -1) + return (-1); + + n = readv(d, iov, iovcnt); + if (n == -1) + return (-1); + + offset = lseek(d, old_offset, SEEK_SET); + if (offset == -1) + return (-1); + + return (n); +} + +ssize_t +pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t old_offset; + ssize_t n; + + old_offset = lseek(d, (off_t)0, SEEK_CUR); + if (old_offset == -1) + return (-1); + + offset = lseek(d, offset, SEEK_SET); + if (offset == -1) + return (-1); + + n = writev(d, iov, iovcnt); + if (n == -1) + return (-1); + + offset = lseek(d, old_offset, SEEK_SET); + if (offset == -1) + return (-1); + + return (n); +} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c new file mode 100644 index 0000000000..7a13c4c83f --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "bhyvegc.h" + +struct bhyvegc { + struct bhyvegc_image *gc_image; +}; + +struct bhyvegc * +bhyvegc_init(int width, int height) +{ + struct bhyvegc *gc; + struct bhyvegc_image *gc_image; + + gc = calloc(1, sizeof (struct bhyvegc)); + + gc_image = calloc(1, sizeof(struct bhyvegc_image)); + gc_image->width = width; + gc_image->height = height; + gc_image->data = calloc(width * height, sizeof (uint32_t)); + + gc->gc_image = gc_image; + + return (gc); +} + +void +bhyvegc_resize(struct bhyvegc *gc, int width, int height) +{ + struct bhyvegc_image *gc_image; + + gc_image = gc->gc_image; + + gc_image->width = width; + gc_image->height = height; + gc_image->data = realloc(gc_image->data, + sizeof (uint32_t) * width * height); + memset(gc_image->data, 0, width * height * sizeof (uint32_t)); +} + +struct bhyvegc_image * +bhyvegc_get_image(struct bhyvegc *gc) +{ + return (gc->gc_image); +} diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h new file mode 100644 index 0000000000..19648f98af --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVEGC_H_ +#define _BHYVEGC_H_ + +struct bhyvegc; + +struct bhyvegc_image { + int width; + int height; + uint32_t *data; +}; + +struct bhyvegc *bhyvegc_init(int width, int height); +void bhyvegc_resize(struct bhyvegc *gc, int width, int height); +struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); + +#endif /* _BHYVEGC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c new file mode 100644 index 0000000000..b985a2286e --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -0,0 +1,820 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $"); + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/time.h> + +#include <machine/segments.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <libgen.h> +#include <unistd.h> +#include <assert.h> +#include <errno.h> +#include <pthread.h> +#include <pthread_np.h> +#include <sysexits.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "console.h" +#include "inout.h" +#include "dbgport.h" +#include "ioapic.h" +#include "mem.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rfb.h" +#include "rtc.h" +#include "vga.h" + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); + +char *vmname; + +int guest_ncpus; +char *guest_uuid_str; + +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; +static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ + +static int strictio; +static int strictmsr = 1; + +#ifdef __FreeBSD__ +static int acpi; +#endif + +static char *progname; +static const int BSP = 0; + +#ifndef __FreeBSD__ +int bcons_wait = 0; +int bcons_connected = 0; +pthread_mutex_t bcons_wait_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t bcons_wait_done = PTHREAD_COND_INITIALIZER; +#endif + +static cpuset_t cpumask; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +static struct vm_exit vmexit[VM_MAXCPU]; + +struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; + int io_reset; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static void +usage(int code) +{ + +#ifdef __FreeBSD__ + fprintf(stderr, + "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n" + " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" + " -a: local apic is in xAPIC mode (deprecated)\n" + " -A: create an ACPI table\n" + " -g: gdb port\n" + " -c: # cpus (default 1)\n" + " -C: include guest memory in core file\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -W: force virtio to use single-vector MSI\n" + " -e: exit on unhandled I/O access\n" + " -h: help\n" + " -s: <slot,driver,configinfo> PCI slot config\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -w: ignore unimplemented MSRs\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n" + " -U: uuid\n", + progname, (int)strlen(progname), ""); +#else + fprintf(stderr, + "Usage: %s [-ehwHPW] [-s <pci>] [-c vcpus]\n" + " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" + " -c: # cpus (default 1)\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -W: force virtio to use single-vector MSI\n" + " -e: exit on unhandled I/O access\n" + " -h: help\n" + " -s: <slot,driver,configinfo> PCI slot config\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -w: ignore unimplemented MSRs\n" + " -Y: disable MPtable generation\n" + " -U: uuid\n", + progname, (int)strlen(progname), ""); +#endif + + exit(code); +} + +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, + int errcode) +{ + struct vmctx *ctx; + int error, restart_instruction; + + ctx = arg; + restart_instruction = 1; + + error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); +} + +void * +paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) +{ + + return (vm_map_gpa(ctx, gaddr, len)); +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_virtio_msix(void) +{ + + return (virtio_msix); +} + +static void * +fbsdrun_start_thread(void *param) +{ + char tname[MAXCOMLEN + 1]; + struct mt_vmm_info *mtp; + int vcpu; + + mtp = param; + vcpu = mtp->mt_vcpu; + + snprintf(tname, sizeof(tname), "vcpu %d", vcpu); + pthread_set_name_np(mtp->mt_thr, tname); + + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(fromcpu == BSP); + + /* + * The 'newcpu' must be activated in the context of 'fromcpu'. If + * vm_activate_cpu() is delayed until newcpu's pthread starts running + * then vmm.ko is out-of-sync with bhyve and this can create a race + * with vm_suspend(). + */ + error = vm_activate_cpu(ctx, newcpu); + assert(error == 0); + + CPU_SET_ATOMIC(newcpu, &cpumask); + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[newcpu].rip = rip; + vmexit[newcpu].inst_length = 0; + + mt_vmm_info[newcpu].mt_ctx = ctx; + mt_vmm_info[newcpu].mt_vcpu = newcpu; + + error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[newcpu]); + assert(error == 0); +} + +static int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if BHYVE_DEBUG + /* + * put guest-driven debug here + */ +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out, string; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + string = vme->u.inout.string; + in = vme->u.inout.in; + out = !in; + + /* Extra-special case of host notifications */ + if (out && port == GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + return (error); + } + + error = emulate_inout(ctx, vcpu, vme, strictio); + if (error) { + fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), + port, vmexit->rip); + return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + uint64_t val; + uint32_t eax, edx; + int error; + + val = 0; + error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); + if (error != 0) { + fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + vme->u.msr.code, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + + eax = val; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + + edx = val >> 32; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); + assert(error == 0); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + + error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); + if (error != 0) { + fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", + vme->u.msr.code, vme->u.msr.wval, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int newcpu; + int retval = VMEXIT_CONTINUE; + + newcpu = spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (retval); +} + +static int +vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tVMX\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); + fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + fprintf(stderr, "\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); + + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_bogus++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_hlt++; + + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); +} + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_pause++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_mtrap++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int err, i; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vmexit->u.inst_emul.vie; + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, + vie, &vmexit->u.inst_emul.paging); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", + vmexit->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? " " : ""); + } + fprintf(stderr, "] at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) +{ +#ifdef __FreeBSD__ + cpuset_t mask; +#endif + int error, rc, prevcpu; + enum vm_exitcode exitcode; + +#ifdef __FreeBSD__ + if (pincpu >= 0) { + CPU_ZERO(&mask); + CPU_SET(pincpu + vcpu, &mask); + error = pthread_setaffinity_np(pthread_self(), + sizeof(mask), &mask); + assert(error == 0); + } +#endif + + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + + while (1) { + error = vm_run(ctx, vcpu, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(1); + } + + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); + default: + exit(1); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + +static int +num_vcpus_allowed(struct vmctx *ctx) +{ + int tmp, error; + + error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); + + /* + * The guest is allowed to spinup more than one processor only if the + * UNRESTRICTED_GUEST capability is available. + */ + if (error == 0) + return (VM_MAXCPU); + else + return (1); +} + +void +fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) +{ + int err, tmp; + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, "VM exit on HLT not supported\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, + "SMP mux requested, no pause support\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (x2apic_mode) + err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + else + err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); + + if (err) { + fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + exit(1); + } + +#ifdef __FreeBSD__ + vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); +#endif +} + +int +main(int argc, char *argv[]) +{ + int c, error, gdb_port, rfb_port, err, bvmcons; + int max_vcpus; + struct vmctx *ctx; + uint64_t rip; + size_t memsize; + + bvmcons = 0; + progname = basename(argv[0]); + gdb_port = 0; + rfb_port = -1; + guest_ncpus = 1; + memsize = 256 * MB; + + +#ifdef __FreeBSD__ + while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) { +#else + while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) { +#endif + switch (c) { + case 'a': + x2apic_mode = 0; + break; +#ifdef __FreeBSD__ + case 'A': + acpi = 1; + break; +#endif + case 'b': + bvmcons = 1; + break; +#ifdef __FreeBSD__ + case 'p': + pincpu = atoi(optarg); + break; +#endif + case 'r': + if (optarg[0] == ':') + rfb_port = atoi(optarg + 1) + RFB_PORT; + else + rfb_port = atoi(optarg); + break; + case 'c': + guest_ncpus = atoi(optarg); + break; +#ifdef __FreeBSD__ + case 'g': + gdb_port = atoi(optarg); + break; +#endif + case 'l': + if (lpc_device_parse(optarg) != 0) { + errx(EX_USAGE, "invalid lpc device " + "configuration '%s'", optarg); + } + break; + case 's': + if (pci_parse_slot(optarg) != 0) + exit(1); + else + break; + case 'm': + error = vm_parse_memsize(optarg, &memsize); + if (error) + errx(EX_USAGE, "invalid memsize '%s'", optarg); + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'I': + /* + * The "-I" option was used to add an ioapic to the + * virtual machine. + * + * An ioapic is now provided unconditionally for each + * virtual machine and this option is now deprecated. + */ + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'e': + strictio = 1; + break; + case 'U': + guest_uuid_str = optarg; + break; + case 'W': + virtio_msix = 0; + break; + case 'x': + x2apic_mode = 1; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + vmname = argv[0]; + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + max_vcpus = num_vcpus_allowed(ctx); + if (guest_ncpus > max_vcpus) { + fprintf(stderr, "%d vCPUs requested but only %d available\n", + guest_ncpus, max_vcpus); + exit(1); + } + + fbsdrun_set_capabilities(ctx, BSP); + + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + if (err) { + fprintf(stderr, "Unable to setup memory (%d)\n", err); + exit(1); + } + + error = init_msr(); + if (error) { + fprintf(stderr, "init_msr error %d", error); + exit(1); + } + + init_mem(); + init_inout(); + atkbdc_init(ctx); + pci_irq_init(ctx); + ioapic_init(ctx); + + rtc_init(ctx); + + /* + * Exit if a device emulation finds an error in it's initilization + */ + if (init_pci(ctx) != 0) + exit(1); + +#ifdef __FreeBSD__ + if (gdb_port != 0) + init_dbgport(gdb_port); +#endif + + if (bvmcons) + init_bvmcons(); + + console_init(); + vga_init(); + if (rfb_port != -1) + rfb_init(rfb_port); + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + /* + * build the guest tables, MP etc. + */ + mptable_build(ctx, guest_ncpus); + + error = smbios_build(ctx); + assert(error == 0); + +#ifdef __FreeBSD__ + if (acpi) { + error = acpi_build(ctx, guest_ncpus); + assert(error == 0); + } + + /* + * Change the proc title to include the VM name. + */ + setproctitle("%s", vmname); +#else + /* + * If applicable, wait for bhyveconsole + */ + if (bcons_wait) { + printf("Waiting for bhyveconsole connection...\n"); + (void) pthread_mutex_lock(&bcons_wait_lock); + while (!bcons_connected) { + (void) pthread_cond_wait(&bcons_wait_done, + &bcons_wait_lock); + } + (void) pthread_mutex_unlock(&bcons_wait_lock); + } +#endif + + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, BSP, rip); + + /* + * Head off to the main event dispatch loop + */ +#ifdef __FreeBSD__ + mevent_dispatch(); +#else + pthread_exit(NULL); +#endif + + exit(1); +} diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h new file mode 100644 index 0000000000..be89314c09 --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) + +struct vmctx; +extern int guest_ncpus; +extern char *guest_uuid_str; +extern char *vmname; +#ifndef __FreeBSD__ +extern int bcons_wait; +extern int bcons_connected; +extern pthread_mutex_t bcons_wait_lock; +extern pthread_cond_t bcons_wait_done; +#endif + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c new file mode 100644 index 0000000000..2da946d420 --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.c @@ -0,0 +1,625 @@ +/*- + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/disk.h> + +#include <assert.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <pthread_np.h> +#include <signal.h> +#include <unistd.h> + +#include <machine/atomic.h> + +#include "bhyverun.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_MAXREQ 33 + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH +}; + +enum blockstat { + BST_FREE, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; +}; + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + pthread_t bc_btid; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + int bc_closing; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + u_int bc_req_count; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be; + + assert(bc->bc_req_count < BLOCKIF_MAXREQ); + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_status = BST_PEND; + be->be_req = breq; + be->be_op = op; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + + bc->bc_req_count++; + + return (0); +} + +static int +blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + if (bc->bc_req_count == 0) + return (ENOENT); + + be = TAILQ_FIRST(&bc->bc_pendq); + assert(be != NULL); + assert(be->be_status == BST_PEND); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = bc->bc_btid; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + + *bep = be; + + return (0); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + assert(be->be_status == BST_DONE); + + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); + + bc->bc_req_count--; +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_req *br; + int err; + + br = be->be_req; + err = 0; + + switch (be->be_op) { + case BOP_READ: + if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset) < 0) + err = errno; + break; + case BOP_WRITE: + if (bc->bc_rdonly) + err = EROFS; + else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset) < 0) + err = errno; + break; + case BOP_FLUSH: + break; + default: + err = EINVAL; + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + + bc = arg; + + for (;;) { + pthread_mutex_lock(&bc->bc_mtx); + while (!blockif_dequeue(bc, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * Check ctxt status here to see if exit requested + */ + if (bc->bc_closing) + pthread_exit(NULL); + } + + /* Not reached */ + return (NULL); +} + +#ifdef __FreeBSD__ +static void +blockif_sigcont_handler(int signal, enum ev_type type, void *arg) +#else +static void +blockif_sigcont_handler(int signal) +#endif +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ +#ifdef __FreeBSD__ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +#else + (void) sigset(SIGCONT, blockif_sigcont_handler); +#endif +} + +struct blockif_ctxt * +blockif_open(const char *optstr, const char *ident) +{ + char tname[MAXCOMLEN + 1]; + char *nopt, *xopts; + struct blockif_ctxt *bc; + struct stat sbuf; + off_t size; + int extra, fd, i, sectsz; + int nocache, sync, ro; + + pthread_once(&blockif_once, blockif_init); + + nocache = 0; + sync = 0; + ro = 0; + + /* + * The first element in the optstring is always a pathname. + * Optional elements follow + */ + nopt = strdup(optstr); + for (xopts = strtok(nopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (!strcmp(xopts, "nocache")) + nocache = 1; + else if (!strcmp(xopts, "sync")) + sync = 1; + else if (!strcmp(xopts, "ro")) + ro = 1; + } + + extra = 0; + if (nocache) + extra |= O_DIRECT; + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + perror("Could not open backing file"); + return (NULL); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (NULL); + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; +#ifdef __FreeBSD__ + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + close(fd); + return (NULL); + } + assert(size != 0); + assert(sectsz != 0); + } +#endif + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + close(fd); + return (NULL); + } + + bc->bc_magic = BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + bc->bc_req_count = 0; + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); + + snprintf(tname, sizeof(tname), "blk-%s", ident); + pthread_set_name_np(bc->bc_btid, tname); + + return (bc); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (bc->bc_req_count < BLOCKIF_MAXREQ) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + blockif_enqueue(bc, breq, op); + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. + */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); + bc->bc_req_count--; + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it return + * prematurely via it's normal callback path. + */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int err; + + err = 0; + + assert(bc->bc_magic == BLOCKIF_SIG); + + /* + * Stop the block i/o thread + */ + bc->bc_closing = 1; + pthread_cond_signal(&bc->bc_cond); + pthread_join(bc->bc_btid, &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535UL*16*255) + sectors = 65535UL*16*255; + + if (sectors >= 65536UL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_sectsz); +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_rdonly); +} diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h new file mode 100644 index 0000000000..5ef120933c --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#ifndef _BLOCK_IF_H_ +#define _BLOCK_IF_H_ + +#include <sys/uio.h> +#include <sys/unistd.h> + +#ifdef __FreeBSD__ +#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */ +#else +#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */ +#endif + +struct blockif_req { + struct iovec br_iov[BLOCKIF_IOV_MAX]; + int br_iovcnt; + off_t br_offset; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; +}; + +struct blockif_ctxt; +struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); + +#endif /* _BLOCK_IF_H_ */ diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c new file mode 100644 index 0000000000..a8d07709be --- /dev/null +++ b/usr/src/cmd/bhyve/console.c @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include "bhyvegc.h" +#include "console.h" + +static struct { + struct bhyvegc *gc; + + fb_render_func_t fb_render_cb; + void *fb_arg; + + kbd_event_func_t kbd_event_cb; + void *kbd_arg; + + ptr_event_func_t ptr_event_cb; + void *ptr_arg; +} console; + +void +console_init(void) +{ + console.gc = bhyvegc_init(640, 400); +} + +struct bhyvegc_image * +console_get_image(void) +{ + struct bhyvegc_image *bhyvegc_image; + + bhyvegc_image = bhyvegc_get_image(console.gc); + + return (bhyvegc_image); +} + +void +console_fb_register(fb_render_func_t render_cb, void *arg) +{ + console.fb_render_cb = render_cb; + console.fb_arg = arg; +} + +void +console_refresh(void) +{ + (*console.fb_render_cb)(console.gc, console.fb_arg); +} + +void +console_kbd_register(kbd_event_func_t event_cb, void *arg) +{ + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; +} + +void +console_ptr_register(ptr_event_func_t event_cb, void *arg) +{ + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; +} + +void +console_key_event(int down, uint32_t keysym) +{ + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); +} + +void +console_ptr_event(uint8_t button, int x, int y) +{ + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); +} diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h new file mode 100644 index 0000000000..bffb7c2456 --- /dev/null +++ b/usr/src/cmd/bhyve/console.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CONSOLE_H_ +#define _CONSOLE_H_ + +struct bhyvegc; + +typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); +typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); +typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); + +void console_init(void); +struct bhyvegc_image *console_get_image(void); + +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); + +void console_kbd_register(kbd_event_func_t event_cb, void *arg); +void console_key_event(int down, uint32_t keysym); + +void console_ptr_register(ptr_event_func_t event_cb, void *arg); +void console_ptr_event(uint8_t button, int x, int y); + +#endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c new file mode 100644 index 0000000000..69b6dfddf1 --- /dev/null +++ b/usr/src/cmd/bhyve/consport.c @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $"); + +#include <sys/types.h> +#include <sys/select.h> + +#include <stdio.h> +#include <stdlib.h> +#include <termios.h> +#include <unistd.h> +#include <stdbool.h> + +#include "inout.h" +#include "pci_lpc.h" + +#define BVM_CONSOLE_PORT 0x220 +#define BVM_CONS_SIG ('b' << 8 | 'v') + +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ +#ifdef __FreeBSD__ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +#endif +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static int opened; + + if (bytes == 2 && in) { + *eax = BVM_CONS_SIG; + return (0); + } + + /* + * Guests might probe this port to look for old ISA devices + * using single-byte reads. Return 0xff for those. + */ + if (bytes == 1 && in) { + *eax = 0xff; + return (0); + } + + if (bytes != 4) + return (-1); + + if (!opened) { + ttyopen(); + opened = 1; + } + + if (in) + *eax = ttyread(); + else + ttywrite(*eax); + + return (0); +} + +SYSRES_IO(BVM_CONSOLE_PORT, 4); + +static struct inout_port consport = { + "bvmcons", + BVM_CONSOLE_PORT, + 1, + IOPORT_F_INOUT, + console_handler +}; + +void +init_bvmcons(void) +{ + + register_inout(&consport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h new file mode 100644 index 0000000000..b95df0bd31 --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +void init_dbgport(int port); + +#endif diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c new file mode 100644 index 0000000000..510649893a --- /dev/null +++ b/usr/src/cmd/bhyve/inout.c @@ -0,0 +1,297 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "bhyverun.h" +#include "inout.h" + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +#define VERIFY_IOPORT(port, size) \ + assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) + +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; + +static int +default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); +} + +static void +register_default_iohandler(int start, int size) +{ + struct inout_port iop; + + VERIFY_IOPORT(start, size); + + bzero(&iop, sizeof(iop)); + iop.name = "default"; + iop.port = start; + iop.size = size; + iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; + iop.handler = default_inout; + + register_inout(&iop); +} + +int +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) +{ + int addrsize, bytes, flags, in, port, prot, rep; + uint32_t eax, val; + inout_func_t handler; + void *arg; + int error, retval; + enum vm_reg_name idxreg; + uint64_t gla, index, iterations, count; + struct vm_inout_str *vis; + struct iovec iov[2]; + + bytes = vmexit->u.inout.bytes; + in = vmexit->u.inout.in; + port = vmexit->u.inout.port; + + assert(port < MAX_IOPORTS); + assert(bytes == 1 || bytes == 2 || bytes == 4); + + handler = inout_handlers[port].handler; + + if (strict && handler == default_inout) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if (in) { + if (!(flags & IOPORT_F_IN)) + return (-1); + } else { + if (!(flags & IOPORT_F_OUT)) + return (-1); + } + + retval = 0; + if (vmexit->u.inout.string) { + vis = &vmexit->u.inout_str; + rep = vis->inout.rep; + addrsize = vis->addrsize; + prot = in ? PROT_WRITE : PROT_READ; + assert(addrsize == 2 || addrsize == 4 || addrsize == 8); + + /* Index register */ + idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + index = vis->index & vie_size2mask(addrsize); + + /* Count register */ + count = vis->count & vie_size2mask(addrsize); + + /* Limit number of back-to-back in/out emulations to 16 */ + iterations = MIN(count, 16); + while (iterations > 0) { + assert(retval == 0); + if (vie_calculate_gla(vis->paging.cpu_mode, + vis->seg_name, &vis->seg_desc, index, bytes, + addrsize, prot, &gla)) { + vm_inject_gp(ctx, vcpu); + break; + } + + error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, + bytes, prot, iov, nitems(iov)); + if (error == -1) { + retval = -1; /* Unrecoverable error */ + break; + } else if (error == 1) { + retval = 0; /* Resume guest to handle fault */ + break; + } + + if (vie_alignment_check(vis->paging.cpl, bytes, + vis->cr0, vis->rflags, gla)) { + vm_inject_ac(ctx, vcpu, 0); + break; + } + + val = 0; + if (!in) + vm_copyin(ctx, vcpu, iov, &val, bytes); + + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval != 0) + break; + + if (in) + vm_copyout(ctx, vcpu, &val, iov, bytes); + + /* Update index */ + if (vis->rflags & PSL_D) + index -= bytes; + else + index += bytes; + + count--; + iterations--; + } + + /* Update index register */ + error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); + assert(error == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if (rep) { + error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, + count, addrsize); + assert(error == 0); + } + + /* Restart the instruction if more iterations remain */ + if (retval == 0 && count != 0) { + error = vm_restart_instruction(ctx, vcpu); + assert(error == 0); + } + } else { + eax = vmexit->u.inout.eax; + val = eax & vie_size2mask(bytes); + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval == 0 && in) { + eax &= ~vie_size2mask(bytes); + eax |= val & vie_size2mask(bytes); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + eax); + assert(error == 0); + } + } + return (retval); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + /* + * Set up the default handler for all ports + */ + register_default_iohandler(0, MAX_IOPORTS); + + /* + * Overwrite with specified handlers + */ + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + int i; + + VERIFY_IOPORT(iop->port, iop->size); + + /* + * Verify that the new registration is not overwriting an already + * allocated i/o range. + */ + if ((iop->flags & IOPORT_F_DEFAULT) == 0) { + for (i = iop->port; i < iop->port + iop->size; i++) { + if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) + return (-1); + } + } + + for (i = iop->port; i < iop->port + iop->size; i++) { + inout_handlers[i].name = iop->name; + inout_handlers[i].flags = iop->flags; + inout_handlers[i].handler = iop->handler; + inout_handlers[i].arg = iop->arg; + } + + return (0); +} + +int +unregister_inout(struct inout_port *iop) +{ + + VERIFY_IOPORT(iop->port, iop->size); + assert(inout_handlers[iop->port].name == iop->name); + + register_default_iohandler(iop->port, iop->size); + + return (0); +} diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h new file mode 100644 index 0000000000..0d4046bd61 --- /dev/null +++ b/usr/src/cmd/bhyve/inout.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _INOUT_H_ +#define _INOUT_H_ + +#include <sys/linker_set.h> + +struct vmctx; +struct vm_exit; + +/* + * inout emulation handlers return 0 on success and -1 on failure. + */ +typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +struct inout_port { + const char *name; + int port; + int size; + int flags; + inout_func_t handler; + void *arg; +}; +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) + +/* + * The following flags are used internally and must not be used by + * device models. + */ +#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ + #name, \ + (port), \ + 1, \ + (flags), \ + (handler), \ + 0 \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) + +void init_inout(void); +int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, + int strict); +int register_inout(struct inout_port *iop); +int unregister_inout(struct inout_port *iop); +void init_bvmcons(void); + +#endif /* _INOUT_H_ */ diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c new file mode 100644 index 0000000000..86ff5c6580 --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.c @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $"); + +#include <sys/types.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "ioapic.h" + +/* + * Assign PCI INTx interrupts to I/O APIC pins in a round-robin + * fashion. Note that we have no idea what the HPET is using, but the + * HPET is also programmable whereas this is intended for hardwired + * PCI interrupts. + * + * This assumes a single I/O APIC where pins >= 16 are permitted for + * PCI devices. + */ +static int pci_pins; + +void +ioapic_init(struct vmctx *ctx) +{ + + if (vm_ioapic_pincount(ctx, &pci_pins) < 0) { + pci_pins = 0; + return; + } + + /* Ignore the first 16 pins. */ + if (pci_pins <= 16) { + pci_pins = 0; + return; + } + pci_pins -= 16; +} + +int +ioapic_pci_alloc_irq(void) +{ + static int last_pin; + + if (pci_pins == 0) + return (-1); + return (16 + (last_pin++ % pci_pins)); +} diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h new file mode 100644 index 0000000000..789f90fea9 --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $ + */ + +#ifndef _IOAPIC_H_ +#define _IOAPIC_H_ + +/* + * Allocate a PCI IRQ from the I/O APIC. + */ +void ioapic_init(struct vmctx *ctx); +int ioapic_pci_alloc_irq(void); + +#endif diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c new file mode 100644 index 0000000000..a153a8e960 --- /dev/null +++ b/usr/src/cmd/bhyve/mem.c @@ -0,0 +1,291 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $"); + +#include <sys/types.h> +#include <sys/tree.h> +#include <sys/errno.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <pthread.h> + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. + */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct mmio_rb_range *entry; + int err, immutable; + + pthread_rwlock_rdlock(&mmio_rwlock); + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; + } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { + pthread_rwlock_unlock(&mmio_rwlock); + return (ESRCH); + } + } + + assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. + */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, + mem_read, mem_write, &entry->mr_param); + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + return (err); +} + +static int +register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) +{ + struct mmio_rb_range *entry, *mrp; + int err; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + + if (mrp != NULL) { + mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + pthread_rwlock_wrlock(&mmio_rwlock); + if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) + err = mmio_rb_add(rbt, mrp); + pthread_rwlock_unlock(&mmio_rwlock); + if (err) + free(mrp); + } else + err = ENOMEM; + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_root, memp)); +} + +int +register_mem_fallback(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_fallback, memp)); +} + +int +unregister_mem(struct mem_range *memp) +{ + struct mem_range *mr; + struct mmio_rb_range *entry = NULL; + int err, i; + + pthread_rwlock_wrlock(&mmio_rwlock); + err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); + if (err == 0) { + mr = &entry->mr_param; + assert(mr->name == memp->name); + assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); + RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); + + /* flush Per-vCPU cache */ + for (i=0; i < VM_MAXCPU; i++) { + if (mmio_hint[i] == entry) + mmio_hint[i] = NULL; + } + } + pthread_rwlock_unlock(&mmio_rwlock); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h new file mode 100644 index 0000000000..09cf56b72e --- /dev/null +++ b/usr/src/cmd/bhyve/mem.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include <sys/linker_set.h> + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); + +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c new file mode 100644 index 0000000000..9d03765c7a --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <x86/mptable.h> + +#include <stdio.h> +#include <string.h> + +#include "acpi.h" +#include "bhyverun.h" +#include "mptbl.h" +#include "pci_emul.h" + +#define MPTABLE_BASE 0xE0000 + +/* floating pointer length + maximum length of configuration table */ +#define MPTABLE_MAX_LENGTH (65536 + 16) + +#define LAPIC_PADDR 0xFEE00000 +#define LAPIC_VERSION 16 + +#define IOAPIC_PADDR 0xFEC00000 +#define IOAPIC_VERSION 0x11 + +#define MP_SPECREV 4 +#define MPFP_SIG "_MP_" + +/* Configuration header defines */ +#define MPCH_SIG "PCMP" +#define MPCH_OEMID "BHyVe " +#define MPCH_OEMID_LEN 8 +#define MPCH_PRODID "Hypervisor " +#define MPCH_PRODID_LEN 12 + +/* Processor entry defines */ +#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */ +#define MPEP_SIG_MODEL 26 +#define MPEP_SIG_STEPPING 5 +#define MPEP_SIG \ + ((MPEP_SIG_FAMILY << 8) | \ + (MPEP_SIG_MODEL << 4) | \ + (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */ + +/* Number of local intr entries */ +#define MPEII_NUM_LOCAL_IRQ 2 + +/* Bus entry defines */ +#define MPE_NUM_BUSES 2 +#define MPE_BUSNAME_LEN 6 +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " + +static void *oem_tbl_start; +static int oem_tbl_size; + +static uint8_t +mpt_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes; + uint8_t sum; + + for(bytes = base, sum = 0; len > 0; len--) { + sum += *bytes++; + } + + return (256 - sum); +} + +static void +mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa) +{ + + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIG, 4); + mpfp->pap = gpa + sizeof(*mpfp); + mpfp->length = 1; + mpfp->spec_rev = MP_SPECREV; + mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mpt_build_mpch(mpcth_t mpch) +{ + + memset(mpch, 0, sizeof(*mpch)); + memcpy(mpch->signature, MPCH_SIG, 4); + mpch->spec_rev = MP_SPECREV; + memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->apic_address = LAPIC_PADDR; +} + +static void +mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu) +{ + int i; + + for (i = 0; i < ncpu; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->type = MPCT_ENTRY_PROCESSOR; + mpep->apic_id = i; // XXX + mpep->apic_version = LAPIC_VERSION; + mpep->cpu_flags = PROCENTRY_FLAG_EN; + if (i == 0) + mpep->cpu_flags |= PROCENTRY_FLAG_BP; + mpep->cpu_signature = MPEP_SIG; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } +} + +static void +mpt_build_localint_entries(int_entry_ptr mpie) +{ + + /* Hardcode LINT0 as ExtINT on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_EXTINT; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 0; + mpie++; + + /* Hardcode LINT1 as NMI on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_NMI; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 1; +} + +static void +mpt_build_bus_entries(bus_entry_ptr mpeb) +{ + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 0; + memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN); + mpeb++; + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 1; + memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN); +} + +static void +mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id) +{ + + memset(mpei, 0, sizeof(*mpei)); + mpei->type = MPCT_ENTRY_IOAPIC; + mpei->apic_id = id; + mpei->apic_version = IOAPIC_VERSION; + mpei->apic_flags = IOAPICENTRY_FLAG_EN; + mpei->apic_address = IOAPIC_PADDR; +} + +static int +mpt_count_ioint_entries(void) +{ + int bus, count; + + count = 0; + for (bus = 0; bus <= PCI_BUSMAX; bus++) + count += pci_count_lintr(bus); + + /* + * Always include entries for the first 16 pins along with a entry + * for each active PCI INTx pin. + */ + return (16 + count); +} + +static void +mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + int_entry_ptr *mpiep, mpie; + + mpiep = arg; + mpie = *mpiep; + memset(mpie, 0, sizeof(*mpie)); + + /* + * This is always after another I/O interrupt entry, so cheat + * and fetch the I/O APIC ID from the prior entry. + */ + mpie->type = MPCT_ENTRY_INT; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_id = bus; + mpie->src_bus_irq = slot << 2 | (pin - 1); + mpie->dst_apic_id = mpie[-1].dst_apic_id; + mpie->dst_apic_int = ioapic_irq; + + *mpiep = mpie + 1; +} + +static void +mpt_build_ioint_entries(int_entry_ptr mpie, int id) +{ + int pin, bus; + + /* + * The following config is taken from kernel mptable.c + * mptable_parse_default_config_ints(...), for now + * just use the default config, tweek later if needed. + */ + + /* First, generate the first 16 pins. */ + for (pin = 0; pin < 16; pin++) { + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_INT; + mpie->src_bus_id = 1; + mpie->dst_apic_id = id; + + /* + * All default configs route IRQs from bus 0 to the first 16 + * pins of the first I/O APIC with an APIC ID of 2. + */ + mpie->dst_apic_int = pin; + switch (pin) { + case 0: + /* Pin 0 is an ExtINT pin. */ + mpie->int_type = INTENTRY_TYPE_EXTINT; + break; + case 2: + /* IRQ 0 is routed to pin 2. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = 0; + break; + case SCI_INT: + /* ACPI SCI is level triggered and active-lo. */ + mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO | + INTENTRY_FLAGS_TRIGGER_LEVEL; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = SCI_INT; + break; + default: + /* All other pins are identity mapped. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = pin; + break; + } + mpie++; + } + + /* Next, generate entries for any PCI INTx interrupts. */ + for (bus = 0; bus <= PCI_BUSMAX; bus++) + pci_walk_lintr(bus, mpt_generate_pci_int, &mpie); +} + +void +mptable_add_oemtbl(void *tbl, int tblsz) +{ + + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +mptable_build(struct vmctx *ctx, int ncpu) +{ + mpcth_t mpch; + bus_entry_ptr mpeb; + io_apic_entry_ptr mpei; + proc_entry_ptr mpep; + mpfps_t mpfp; + int_entry_ptr mpie; + int ioints, bus; + char *curraddr; + char *startaddr; + + startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "mptable requires mapped mem\n"); + return (ENOMEM); + } + + /* + * There is no way to advertise multiple PCI hierarchies via MPtable + * so require that there is no PCI hierarchy with a non-zero bus + * number. + */ + for (bus = 1; bus <= PCI_BUSMAX; bus++) { + if (pci_bus_configured(bus)) { + fprintf(stderr, "MPtable is incompatible with " + "multiple PCI hierarchies.\r\n"); + fprintf(stderr, "MPtable generation can be disabled " + "by passing the -Y option to bhyve(8).\r\n"); + return (EINVAL); + } + } + + curraddr = startaddr; + mpfp = (mpfps_t)curraddr; + mpt_build_mpfp(mpfp, MPTABLE_BASE); + curraddr += sizeof(*mpfp); + + mpch = (mpcth_t)curraddr; + mpt_build_mpch(mpch); + curraddr += sizeof(*mpch); + + mpep = (proc_entry_ptr)curraddr; + mpt_build_proc_entries(mpep, ncpu); + curraddr += sizeof(*mpep) * ncpu; + mpch->entry_count += ncpu; + + mpeb = (bus_entry_ptr) curraddr; + mpt_build_bus_entries(mpeb); + curraddr += sizeof(*mpeb) * MPE_NUM_BUSES; + mpch->entry_count += MPE_NUM_BUSES; + + mpei = (io_apic_entry_ptr)curraddr; + mpt_build_ioapic_entries(mpei, 0); + curraddr += sizeof(*mpei); + mpch->entry_count++; + + mpie = (int_entry_ptr) curraddr; + ioints = mpt_count_ioint_entries(); + mpt_build_ioint_entries(mpie, 0); + curraddr += sizeof(*mpie) * ioints; + mpch->entry_count += ioints; + + mpie = (int_entry_ptr)curraddr; + mpt_build_localint_entries(mpie); + curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ; + mpch->entry_count += MPEII_NUM_LOCAL_IRQ; + + if (oem_tbl_start) { + mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE; + mpch->oem_table_size = oem_tbl_size; + memcpy(curraddr, oem_tbl_start, oem_tbl_size); + } + + mpch->base_table_length = curraddr - (char *)mpch; + mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length); + + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h new file mode 100644 index 0000000000..d78ea6da09 --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $ + */ + +#ifndef _MPTBL_H_ +#define _MPTBL_H_ + +int mptable_build(struct vmctx *ctx, int ncpu); +void mptable_add_oemtbl(void *tbl, int tblsz); + +#endif /* _MPTBL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c new file mode 100644 index 0000000000..b68c977c1f --- /dev/null +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -0,0 +1,2009 @@ +/*- + * Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <sys/disk.h> +#include <sys/ata.h> +#include <sys/endian.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> +#include <inttypes.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "ahci.h" +#include "block_if.h" + +#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ + +#define PxSIG_ATA 0x00000101 /* ATA drive */ +#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ + +enum sata_fis_type { + FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ + FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ + FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ + FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ + FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ + FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ + FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ + FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ +}; + +/* + * SCSI opcodes + */ +#define TEST_UNIT_READY 0x00 +#define REQUEST_SENSE 0x03 +#define INQUIRY 0x12 +#define START_STOP_UNIT 0x1B +#define PREVENT_ALLOW 0x1E +#define READ_CAPACITY 0x25 +#define READ_10 0x28 +#define POSITION_TO_ELEMENT 0x2B +#define READ_TOC 0x43 +#define GET_EVENT_STATUS_NOTIFICATION 0x4A +#define MODE_SENSE_10 0x5A +#define READ_12 0xA8 +#define READ_CD 0xBE + +/* + * SCSI mode page codes + */ +#define MODEPAGE_RW_ERROR_RECOVERY 0x01 +#define MODEPAGE_CD_CAPABILITIES 0x2A + +/* + * ATA commands + */ +#define ATA_SF_ENAB_SATA_SF 0x10 +#define ATA_SATA_SF_AN 0x05 +#define ATA_SF_DIS_SATA_SF 0x90 + +/* + * Debug printf + */ +#ifdef AHCI_DEBUG +static FILE *dbg; +#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) +#else +#define DPRINTF(format, arg...) +#endif +#define WPRINTF(format, arg...) printf(format, ##arg) + +struct ahci_ioreq { + struct blockif_req io_req; + struct ahci_port *io_pr; + STAILQ_ENTRY(ahci_ioreq) io_flist; + TAILQ_ENTRY(ahci_ioreq) io_blist; + uint8_t *cfis; + uint32_t len; + uint32_t done; + int slot; + int prdtl; +}; + +struct ahci_port { + struct blockif_ctxt *bctx; + struct pci_ahci_softc *pr_sc; + uint8_t *cmd_lst; + uint8_t *rfis; + int atapi; + int reset; + int mult_sectors; + uint8_t xfermode; + uint8_t sense_key; + uint8_t asc; + uint32_t pending; + + uint32_t clb; + uint32_t clbu; + uint32_t fb; + uint32_t fbu; + uint32_t is; + uint32_t ie; + uint32_t cmd; + uint32_t unused0; + uint32_t tfd; + uint32_t sig; + uint32_t ssts; + uint32_t sctl; + uint32_t serr; + uint32_t sact; + uint32_t ci; + uint32_t sntf; + uint32_t fbs; + + /* + * i/o request info + */ + struct ahci_ioreq *ioreq; + int ioqsz; + STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; + TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; +}; + +struct ahci_cmd_hdr { + uint16_t flags; + uint16_t prdtl; + uint32_t prdbc; + uint64_t ctba; + uint32_t reserved[4]; +}; + +struct ahci_prdt_entry { + uint64_t dba; + uint32_t reserved; +#define DBCMASK 0x3fffff + uint32_t dbc; +}; + +struct pci_ahci_softc { + struct pci_devinst *asc_pi; + pthread_mutex_t mtx; + int ports; + uint32_t cap; + uint32_t ghc; + uint32_t is; + uint32_t pi; + uint32_t vs; + uint32_t ccc_ctl; + uint32_t ccc_pts; + uint32_t em_loc; + uint32_t em_ctl; + uint32_t cap2; + uint32_t bohc; + uint32_t lintr; + struct ahci_port port[MAX_PORTS]; +}; +#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) + +static inline void lba_to_msf(uint8_t *buf, int lba) +{ + lba += 150; + buf[0] = (lba / 75) / 60; + buf[1] = (lba / 75) % 60; + buf[2] = lba % 75; +} + +/* + * generate HBA intr depending on whether or not ports within + * the controller have an interrupt pending. + */ +static void +ahci_generate_intr(struct pci_ahci_softc *sc) +{ + struct pci_devinst *pi; + int i; + + pi = sc->asc_pi; + + for (i = 0; i < sc->ports; i++) { + struct ahci_port *pr; + pr = &sc->port[i]; + if (pr->is & pr->ie) + sc->is |= (1 << i); + } + + DPRINTF("%s %x\n", __func__, sc->is); + + if (sc->is && (sc->ghc & AHCI_GHC_IE)) { + if (pci_msi_enabled(pi)) { + /* + * Generate an MSI interrupt on every edge + */ + pci_generate_msi(pi, 0); + } else if (!sc->lintr) { + /* + * Only generate a pin-based interrupt if one wasn't + * in progress + */ + sc->lintr = 1; + pci_lintr_assert(pi); + } + } else if (sc->lintr) { + /* + * No interrupts: deassert pin-based signal if it had + * been asserted + */ + pci_lintr_deassert(pi); + sc->lintr = 0; + } +} + +static void +ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) +{ + int offset, len, irq; + + if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) + return; + + switch (ft) { + case FIS_TYPE_REGD2H: + offset = 0x40; + len = 20; + irq = AHCI_P_IX_DHR; + break; + case FIS_TYPE_SETDEVBITS: + offset = 0x58; + len = 8; + irq = AHCI_P_IX_SDB; + break; + case FIS_TYPE_PIOSETUP: + offset = 0x20; + len = 20; + irq = 0; + break; + default: + WPRINTF("unsupported fis type %d\n", ft); + return; + } + memcpy(p->rfis + offset, fis, len); + if (irq) { + p->is |= irq; + ahci_generate_intr(p->pr_sc); + } +} + +static void +ahci_write_fis_piosetup(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_PIOSETUP; + ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); +} + +static void +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd) +{ + uint8_t fis[8]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = error; + fis[2] = tfd & 0x77; + *(uint32_t *)(fis + 4) = (1 << slot); + if (fis[2] & ATA_S_ERROR) + p->is |= AHCI_P_IX_TFE; + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); +} + +static void +ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[20]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = (1 << 6); + fis[2] = tfd & 0xff; + fis[3] = error; + fis[4] = cfis[4]; + fis[5] = cfis[5]; + fis[6] = cfis[6]; + fis[7] = cfis[7]; + fis[8] = cfis[8]; + fis[9] = cfis[9]; + fis[10] = cfis[10]; + fis[11] = cfis[11]; + fis[12] = cfis[12]; + fis[13] = cfis[13]; + if (fis[2] & ATA_S_ERROR) + p->is |= AHCI_P_IX_TFE; + else + p->ci &= ~(1 << slot); + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_reset_fis_d2h(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[3] = 1; + fis[4] = 1; + if (p->atapi) { + fis[5] = 0x14; + fis[6] = 0xeb; + } + fis[12] = 1; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_check_stopped(struct ahci_port *p) +{ + /* + * If we are no longer processing the command list and nothing + * is in-flight, clear the running bit, the current command + * slot, the command issue and active bits. + */ + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (p->pending == 0) { + p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); + p->ci = 0; + p->sact = 0; + } + } +} + +static void +ahci_port_stop(struct ahci_port *p) +{ + struct ahci_ioreq *aior; + uint8_t *cfis; + int slot; + int ncq; + int error; + + assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); + + TAILQ_FOREACH(aior, &p->iobhd, io_blist) { + /* + * Try to cancel the outstanding blockif request. + */ + error = blockif_cancel(p->bctx, &aior->io_req); + if (error != 0) + continue; + + slot = aior->slot; + cfis = aior->cfis; + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) + ncq = 1; + + if (ncq) + p->sact &= ~(1 << slot); + else + p->ci &= ~(1 << slot); + + /* + * This command is now done. + */ + p->pending &= ~(1 << slot); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + } + + ahci_check_stopped(p); +} + +static void +ahci_port_reset(struct ahci_port *pr) +{ + pr->sctl = 0; + pr->serr = 0; + pr->sact = 0; + pr->xfermode = ATA_UDMA6; + pr->mult_sectors = 128; + + if (!pr->bctx) { + pr->ssts = ATA_SS_DET_NO_DEVICE; + pr->sig = 0xFFFFFFFF; + pr->tfd = 0x7F; + return; + } + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 | + ATA_SS_IPM_ACTIVE; + pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; + if (!pr->atapi) { + pr->sig = PxSIG_ATA; + pr->tfd |= ATA_S_READY; + } else + pr->sig = PxSIG_ATAPI; + ahci_write_reset_fis_d2h(pr); +} + +static void +ahci_reset(struct pci_ahci_softc *sc) +{ + int i; + + sc->ghc = AHCI_GHC_AE; + sc->is = 0; + + if (sc->lintr) { + pci_lintr_deassert(sc->asc_pi); + sc->lintr = 0; + } + + for (i = 0; i < sc->ports; i++) { + sc->port[i].ie = 0; + sc->port[i].is = 0; + ahci_port_reset(&sc->port[i]); + } +} + +static void +ata_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i ^ 1] = *src++; + else + dest[i ^ 1] = ' '; + } +} + +static void +atapi_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i] = *src++; + else + dest[i] = ' '; + } +} + +static void +ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, + int seek) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + struct pci_ahci_softc *sc; + struct ahci_prdt_entry *prdt; + struct ahci_cmd_hdr *hdr; + uint64_t lba; + uint32_t len; + int i, err, iovcnt, ncq, readop; + + sc = p->pr_sc; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + ncq = 0; + readop = 1; + + prdt += seek; + if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) + readop = 0; + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[11] << 8 | cfis[3]; + if (!len) + len = 65536; + ncq = 1; + } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[13] << 8 | cfis[12]; + if (!len) + len = 65536; + } else { + lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | + (cfis[5] << 8) | cfis[4]; + len = cfis[12]; + if (!len) + len = 256; + } + lba *= blockif_sectsz(p->bctx); + len *= blockif_sectsz(p->bctx); + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + iovcnt = hdr->prdtl - seek; + if (iovcnt > BLOCKIF_IOV_MAX) { + aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; + iovcnt = BLOCKIF_IOV_MAX; + } else + aior->prdtl = 0; + breq->br_iovcnt = iovcnt; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + /* + * Build up the iovec based on the prdt + */ + for (i = 0; i < iovcnt; i++) { + uint32_t dbcsz; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), + prdt->dba, dbcsz); + breq->br_iov[i].iov_len = dbcsz; + aior->done += dbcsz; + prdt++; + } + if (readop) + err = blockif_read(p->bctx, breq); + else + err = blockif_write(p->bctx, breq); + assert(err == 0); + + if (ncq) + p->ci &= ~(1 << slot); +} + +static void +ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + int err; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = 0; + aior->done = 0; + aior->prdtl = 0; + breq = &aior->io_req; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_flush(p->bctx, breq); + assert(err == 0); +} + +static inline void +write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *from; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + from = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = len < dbcsz ? len : dbcsz; + memcpy(ptr, from, sublen); + len -= sublen; + from += sublen; + prdt++; + } + hdr->prdbc = size - len; +} + +static void +handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0) { + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + } else { + uint16_t buf[256]; + uint64_t sectors; + uint16_t cyl; + uint8_t sech, heads; + + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + blockif_chs(p->bctx, &cyl, &heads, &sech); + memset(buf, 0, sizeof(buf)); + buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; + /* TODO emulate different serial? */ + ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); + buf[47] = (0x8000 | 128); + buf[48] = 0x1; + buf[49] = (1 << 8 | 1 << 9 | 1 << 11); + buf[50] = (1 << 14); + buf[53] = (1 << 1 | 1 << 2); + if (p->mult_sectors) + buf[59] = (0x100 | p->mult_sectors); + buf[60] = sectors; + buf[61] = (sectors >> 16); + buf[63] = 0x7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 0x3; + buf[65] = 100; + buf[66] = 100; + buf[67] = 100; + buf[68] = 100; + buf[75] = 31; + buf[76] = (1 << 8 | 1 << 2); + buf[80] = 0x1f0; + buf[81] = 0x28; + buf[82] = (1 << 5 | 1 << 14); + buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14); + buf[84] = (1 << 14); + buf[85] = (1 << 5 | 1 << 14); + buf[86] = (1 << 10 | 1 << 12 | 1 << 13); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[93] = (1 | 1 <<14); + buf[100] = sectors; + buf[101] = (sectors >> 16); + buf[102] = (sectors >> 32); + buf[103] = (sectors >> 48); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + p->tfd = ATA_S_DSC | ATA_S_READY; + p->is |= AHCI_P_IX_DP; + p->ci &= ~(1 << slot); + } + ahci_generate_intr(p->pr_sc); +} + +static void +handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + if (!p->atapi) { + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + } else { + uint16_t buf[256]; + + memset(buf, 0, sizeof(buf)); + buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); + /* TODO emulate different serial? */ + ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); + buf[49] = (1 << 9 | 1 << 8); + buf[50] = (1 << 14 | 1); + buf[53] = (1 << 2 | 1 << 1); + buf[62] = 0x3f; + buf[63] = 7; + buf[64] = 3; + buf[65] = 100; + buf[66] = 100; + buf[67] = 100; + buf[68] = 100; + buf[76] = (1 << 2 | 1 << 1); + buf[78] = (1 << 5); + buf[80] = (0x1f << 4); + buf[82] = (1 << 4); + buf[83] = (1 << 14); + buf[84] = (1 << 14); + buf[85] = (1 << 4); + buf[87] = (1 << 14); + buf[88] = (1 << 14 | 0x7f); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + p->tfd = ATA_S_DSC | ATA_S_READY; + p->is |= AHCI_P_IX_DHR; + p->ci &= ~(1 << slot); + } + ahci_generate_intr(p->pr_sc); +} + +static void +atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[36]; + uint8_t *acmd; + int len; + + acmd = cfis + 0x40; + + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + + len = sizeof(buf); + if (len > acmd[4]) + len = acmd[4]; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, len); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[8]; + uint64_t sectors; + + sectors = blockif_size(p->bctx) / 2048; + be32enc(buf, sectors - 1); + be32enc(buf + 4, 2048); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint8_t format; + int len; + + acmd = cfis + 0x40; + + len = be16dec(acmd + 7); + format = acmd[9] >> 6; + switch (format) { + case 0: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, buf[20], *bp; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + if (start_track > 1 && start_track != 0xaa) { + uint32_t tfd; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + if (start_track <= 1) { + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 1; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + } + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 0xaa; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 1: + { + uint8_t buf[12]; + + memset(buf, 0, sizeof(buf)); + buf[1] = 0xa; + buf[2] = 0x1; + buf[3] = 0x1; + if (len > sizeof(buf)) + len = sizeof(buf); + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 2: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, *bp, buf[50]; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa2; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + default: + { + uint32_t tfd; + + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + break; + } + } +} + +static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, + uint32_t done, int seek) +{ + struct ahci_ioreq *aior; + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct blockif_req *breq; + struct pci_ahci_softc *sc; + uint8_t *acmd; + uint64_t lba; + uint32_t len; + int i, err, iovcnt; + + sc = p->pr_sc; + acmd = cfis + 0x40; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + + prdt += seek; + lba = be32dec(acmd + 2); + if (acmd[0] == READ_10) + len = be16dec(acmd + 7); + else + len = be32dec(acmd + 6); + if (len == 0) { + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + } + lba *= 2048; + len *= 2048; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + iovcnt = hdr->prdtl - seek; + if (iovcnt > BLOCKIF_IOV_MAX) { + aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; + iovcnt = BLOCKIF_IOV_MAX; + } else + aior->prdtl = 0; + breq->br_iovcnt = iovcnt; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + /* + * Build up the iovec based on the prdt + */ + for (i = 0; i < iovcnt; i++) { + uint32_t dbcsz; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), + prdt->dba, dbcsz); + breq->br_iov[i].iov_len = dbcsz; + aior->done += dbcsz; + prdt++; + } + err = blockif_read(p->bctx, breq); + assert(err == 0); +} + +static void +atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[64]; + uint8_t *acmd; + int len; + + acmd = cfis + 0x40; + len = acmd[4]; + if (len > sizeof(buf)) + len = sizeof(buf); + memset(buf, 0, len); + buf[0] = 0x70 | (1 << 7); + buf[2] = p->sense_key; + buf[7] = 10; + buf[12] = p->asc; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd = cfis + 0x40; + uint32_t tfd; + + switch (acmd[4] & 3) { + case 0: + case 1: + case 3: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + tfd = ATA_S_READY | ATA_S_DSC; + break; + case 2: + /* TODO eject media */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x53; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + uint8_t pc, code; + int len; + + acmd = cfis + 0x40; + len = be16dec(acmd + 7); + pc = acmd[2] >> 6; + code = acmd[2] & 0x3f; + + switch (pc) { + case 0: + switch (code) { + case MODEPAGE_RW_ERROR_RECOVERY: + { + uint8_t buf[16]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 16 - 2); + buf[2] = 0x70; + buf[8] = 0x01; + buf[9] = 16 - 10; + buf[11] = 0x05; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + case MODEPAGE_CD_CAPABILITIES: + { + uint8_t buf[30]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 30 - 2); + buf[2] = 0x70; + buf[8] = 0x2A; + buf[9] = 30 - 10; + buf[10] = 0x08; + buf[12] = 0x71; + be16enc(&buf[18], 2); + be16enc(&buf[20], 512); + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + default: + goto error; + break; + } + break; + case 3: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x39; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; +error: + case 1: + case 2: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_get_event_status_notification(struct ahci_port *p, int slot, + uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + + acmd = cfis + 0x40; + + /* we don't support asynchronous operation */ + if (!(acmd[1] & 1)) { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } else { + uint8_t buf[8]; + int len; + + len = be16dec(acmd + 7); + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 8 - 2); + buf[2] = 0x04; + buf[3] = 0x10; + buf[5] = 0x02; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + + acmd = cfis + 0x40; + +#ifdef AHCI_DEBUG + { + int i; + DPRINTF("ACMD:"); + for (i = 0; i < 16; i++) + DPRINTF("%02x ", acmd[i]); + DPRINTF("\n"); + } +#endif + + switch (acmd[0]) { + case TEST_UNIT_READY: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case INQUIRY: + atapi_inquiry(p, slot, cfis); + break; + case READ_CAPACITY: + atapi_read_capacity(p, slot, cfis); + break; + case PREVENT_ALLOW: + /* TODO */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case READ_TOC: + atapi_read_toc(p, slot, cfis); + break; + case READ_10: + case READ_12: + atapi_read(p, slot, cfis, 0, 0); + break; + case REQUEST_SENSE: + atapi_request_sense(p, slot, cfis); + break; + case START_STOP_UNIT: + atapi_start_stop_unit(p, slot, cfis); + break; + case MODE_SENSE_10: + atapi_mode_sense(p, slot, cfis); + break; + case GET_EVENT_STATUS_NOTIFICATION: + atapi_get_event_status_notification(p, slot, cfis); + break; + default: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x20; + ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | + ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + + switch (cfis[2]) { + case ATA_ATA_IDENTIFY: + handle_identify(p, slot, cfis); + break; + case ATA_SETFEATURES: + { + switch (cfis[3]) { + case ATA_SF_ENAB_SATA_SF: + switch (cfis[12]) { + case ATA_SATA_SF_AN: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + break; + case ATA_SF_ENAB_WCACHE: + case ATA_SF_DIS_WCACHE: + case ATA_SF_ENAB_RCACHE: + case ATA_SF_DIS_RCACHE: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + case ATA_SF_SETXFER: + { + switch (cfis[12] & 0xf8) { + case ATA_PIO: + case ATA_PIO0: + break; + case ATA_WDMA0: + case ATA_UDMA0: + p->xfermode = (cfis[12] & 0x7); + break; + } + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + } + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + } + case ATA_SET_MULTI: + if (cfis[12] != 0 && + (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + } else { + p->mult_sectors = cfis[12]; + p->tfd = ATA_S_DSC | ATA_S_READY; + } + p->is |= AHCI_P_IX_DP; + p->ci &= ~(1 << slot); + ahci_generate_intr(p->pr_sc); + break; + case ATA_READ_DMA: + case ATA_WRITE_DMA: + case ATA_READ_DMA48: + case ATA_WRITE_DMA48: + case ATA_READ_FPDMA_QUEUED: + case ATA_WRITE_FPDMA_QUEUED: + ahci_handle_dma(p, slot, cfis, 0, 0); + break; + case ATA_FLUSHCACHE: + case ATA_FLUSHCACHE48: + ahci_handle_flush(p, slot, cfis); + break; + case ATA_STANDBY_CMD: + break; + case ATA_NOP: + case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_IMMEDIATE: + case ATA_SLEEP: + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_ATAPI_IDENTIFY: + handle_atapi_identify(p, slot, cfis); + break; + case ATA_PACKET_CMD: + if (!p->atapi) { + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + ahci_generate_intr(p->pr_sc); + } else + handle_packet_cmd(p, slot, cfis); + break; + default: + WPRINTF("Unsupported cmd:%02x\n", cfis[2]); + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + ahci_generate_intr(p->pr_sc); + break; + } +} + +static void +ahci_handle_slot(struct ahci_port *p, int slot) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct pci_ahci_softc *sc; + uint8_t *cfis; + int cfl; + + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + cfl = (hdr->flags & 0x1f) * 4; + cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + +#ifdef AHCI_DEBUG + DPRINTF("\ncfis:"); + for (i = 0; i < cfl; i++) { + if (i % 10 == 0) + DPRINTF("\n"); + DPRINTF("%02x ", cfis[i]); + } + DPRINTF("\n"); + + for (i = 0; i < hdr->prdtl; i++) { + DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba); + prdt++; + } +#endif + + if (cfis[0] != FIS_TYPE_REGH2D) { + WPRINTF("Not a H2D FIS:%02x\n", cfis[0]); + return; + } + + if (cfis[1] & 0x80) { + ahci_handle_cmd(p, slot, cfis); + } else { + if (cfis[15] & (1 << 2)) + p->reset = 1; + else if (p->reset) { + p->reset = 0; + ahci_port_reset(p); + } + p->ci &= ~(1 << slot); + } +} + +static void +ahci_handle_port(struct ahci_port *p) +{ + int i; + + if (!(p->cmd & AHCI_P_CMD_ST)) + return; + + /* + * Search for any new commands to issue ignoring those that + * are already in-flight. + */ + for (i = 0; (i < 32) && p->ci; i++) { + if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { + p->cmd &= ~AHCI_P_CMD_CCS_MASK; + p->cmd |= i << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, i); + } + } +} + +/* + * blockif callback routine - this runs in the context of the blockif + * i/o thread, so the mutex needs to be acquired. + */ +static void +ata_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint32_t tfd; + uint8_t *cfis; + int pending, slot, ncq; + + DPRINTF("%s %d\n", __func__, err); + + ncq = 0; + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + pending = aior->prdtl; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) + ncq = 1; + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (pending && !err) { + ahci_handle_dma(p, slot, cfis, aior->done, + hdr->prdtl - pending); + goto out; + } + + if (!err && aior->done == aior->len) { + tfd = ATA_S_READY | ATA_S_DSC; + if (ncq) + hdr->prdbc = 0; + else + hdr->prdbc = aior->len; + } else { + tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + hdr->prdbc = 0; + if (ncq) + p->serr |= (1 << slot); + } + + if (ncq) { + p->sact &= ~(1 << slot); + ahci_write_fis_sdb(p, slot, tfd); + } else + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +atapi_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint8_t *cfis; + uint32_t tfd; + int pending, slot; + + DPRINTF("%s %d\n", __func__, err); + + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + pending = aior->prdtl; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (pending && !err) { + atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending); + goto out; + } + + if (!err && aior->done == aior->len) { + tfd = ATA_S_READY | ATA_S_DSC; + hdr->prdbc = aior->len; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x21; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + hdr->prdbc = 0; + } + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +pci_ahci_ioreq_init(struct ahci_port *pr) +{ + struct ahci_ioreq *vr; + int i; + + pr->ioqsz = blockif_queuesz(pr->bctx); + pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); + STAILQ_INIT(&pr->iofhd); + + /* + * Add all i/o request entries to the free queue + */ + for (i = 0; i < pr->ioqsz; i++) { + vr = &pr->ioreq[i]; + vr->io_pr = pr; + if (!pr->atapi) + vr->io_req.br_callback = ata_ioreq_cb; + else + vr->io_req.br_callback = atapi_ioreq_cb; + vr->io_req.br_param = vr; + STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); + } + + TAILQ_INIT(&pr->iobhd); +} + +static void +pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + struct ahci_port *p = &sc->port[port]; + + DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + port, offset, value); + + switch (offset) { + case AHCI_P_CLB: + p->clb = value; + break; + case AHCI_P_CLBU: + p->clbu = value; + break; + case AHCI_P_FB: + p->fb = value; + break; + case AHCI_P_FBU: + p->fbu = value; + break; + case AHCI_P_IS: + p->is &= ~value; + break; + case AHCI_P_IE: + p->ie = value & 0xFDC000FF; + ahci_generate_intr(sc); + break; + case AHCI_P_CMD: + { + p->cmd = value; + + if (!(value & AHCI_P_CMD_ST)) { + ahci_port_stop(p); + } else { + uint64_t clb; + + p->cmd |= AHCI_P_CMD_CR; + clb = (uint64_t)p->clbu << 32 | p->clb; + p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, + AHCI_CL_SIZE * AHCI_MAX_SLOTS); + } + + if (value & AHCI_P_CMD_FRE) { + uint64_t fb; + + p->cmd |= AHCI_P_CMD_FR; + fb = (uint64_t)p->fbu << 32 | p->fb; + /* we don't support FBSCP, so rfis size is 256Bytes */ + p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); + } else { + p->cmd &= ~AHCI_P_CMD_FR; + } + + if (value & AHCI_P_CMD_CLO) { + p->tfd = 0; + p->cmd &= ~AHCI_P_CMD_CLO; + } + + ahci_handle_port(p); + break; + } + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_P_SCTL: + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (value & ATA_SC_DET_RESET) + ahci_port_reset(p); + p->sctl = value; + } + break; + case AHCI_P_SERR: + p->serr &= ~value; + break; + case AHCI_P_SACT: + p->sact |= value; + break; + case AHCI_P_CI: + p->ci |= value; + ahci_handle_port(p); + break; + case AHCI_P_SNTF: + case AHCI_P_FBS: + default: + break; + } +} + +static void +pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + offset, value); + + switch (offset) { + case AHCI_CAP: + case AHCI_PI: + case AHCI_VS: + case AHCI_CAP2: + DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_GHC: + if (value & AHCI_GHC_HR) + ahci_reset(sc); + else if (value & AHCI_GHC_IE) { + sc->ghc |= AHCI_GHC_IE; + ahci_generate_intr(sc); + } + break; + case AHCI_IS: + sc->is &= ~value; + ahci_generate_intr(sc); + break; + default: + break; + } +} + +static void +pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + + assert(baridx == 5); + assert(size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + pci_ahci_host_write(sc, offset, value); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + pci_ahci_port_write(sc, offset, value); + else + WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + + switch (offset) { + case AHCI_CAP: + case AHCI_GHC: + case AHCI_IS: + case AHCI_PI: + case AHCI_VS: + case AHCI_CCCC: + case AHCI_CCCP: + case AHCI_EM_LOC: + case AHCI_EM_CTL: + case AHCI_CAP2: + { + uint32_t *p = &sc->cap; + p += (offset - AHCI_CAP) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n", + offset, value); + + return (value); +} + +static uint64_t +pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + + switch (offset) { + case AHCI_P_CLB: + case AHCI_P_CLBU: + case AHCI_P_FB: + case AHCI_P_FBU: + case AHCI_P_IS: + case AHCI_P_IE: + case AHCI_P_CMD: + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + case AHCI_P_SCTL: + case AHCI_P_SERR: + case AHCI_P_SACT: + case AHCI_P_CI: + case AHCI_P_SNTF: + case AHCI_P_FBS: + { + uint32_t *p= &sc->port[port].clb; + p += (offset - AHCI_P_CLB) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + + DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n", + port, offset, value); + + return value; +} + +static uint64_t +pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + uint32_t value; + + assert(baridx == 5); + assert(size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + value = pci_ahci_host_read(sc, offset); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + value = pci_ahci_port_read(sc, offset); + else { + value = 0; + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset); + } + + pthread_mutex_unlock(&sc->mtx); + + return (value); +} + +static int +pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + struct pci_ahci_softc *sc; + int ret, slots; + + ret = 0; + + if (opts == NULL) { + fprintf(stderr, "pci_ahci: backing device required\n"); + return (1); + } + +#ifdef AHCI_DEBUG + dbg = fopen("/tmp/log", "w+"); +#endif + + sc = calloc(1, sizeof(struct pci_ahci_softc)); + pi->pi_arg = sc; + sc->asc_pi = pi; + sc->ports = MAX_PORTS; + + /* + * Only use port 0 for a backing device. All other ports will be + * marked as unused + */ + sc->port[0].atapi = atapi; + + /* + * Attempt to open the backing image. Use the PCI + * slot/func for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + ret = 1; + goto open_fail; + } + sc->port[0].bctx = bctxt; + sc->port[0].pr_sc = sc; + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[0]); + + pthread_mutex_init(&sc->mtx, NULL); + + /* Intel ICH8 AHCI */ + slots = sc->port[0].ioqsz; + if (slots > 32) + slots = 32; + --slots; + sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | + AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | + AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| + AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | + (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); + + /* Only port 0 implemented */ + sc->pi = 1; + sc->vs = 0x10300; + sc->cap2 = AHCI_CAP2_APST; + ahci_reset(sc); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); + pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); + pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, + AHCI_OFFSET + sc->ports * AHCI_STEP); + + pci_lintr_request(pi); + +open_fail: + if (ret) { + blockif_close(sc->port[0].bctx); + free(sc); + } + + return (ret); +} + +static int +pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 0)); +} + +static int +pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 1)); +} + +/* + * Use separate emulation names to distinguish drive and atapi devices + */ +struct pci_devemu pci_de_ahci_hd = { + .pe_emu = "ahci-hd", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_hd); + +struct pci_devemu pci_de_ahci_cd = { + .pe_emu = "ahci-cd", + .pe_init = pci_ahci_atapi_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c new file mode 100644 index 0000000000..3b4ca805cc --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -0,0 +1,2103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/errno.h> + +#include <ctype.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <assert.h> +#include <stdbool.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "bhyverun.h" +#include "inout.h" +#include "ioapic.h" +#include "mem.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CONF1_ENABLE 0x80000000ul + +#define CFGWRITE(pi,off,val,b) \ +do { \ + if ((b) == 1) { \ + pci_set_cfgdata8((pi),(off),(val)); \ + } else if ((b) == 2) { \ + pci_set_cfgdata16((pi),(off),(val)); \ + } else { \ + pci_set_cfgdata32((pi),(off),(val)); \ + } \ +} while (0) + +#define MAXBUSES (PCI_BUSMAX + 1) +#define MAXSLOTS (PCI_SLOTMAX + 1) +#define MAXFUNCS (PCI_FUNCMAX + 1) + +struct funcinfo { + char *fi_name; + char *fi_param; + struct pci_devinst *fi_devi; +}; + +struct intxinfo { + int ii_count; + int ii_pirq_pin; + int ii_ioapic_irq; +}; + +struct slotinfo { + struct intxinfo si_intpins[4]; + struct funcinfo si_funcs[MAXFUNCS]; +}; + +struct businfo { + uint16_t iobase, iolimit; /* I/O window */ + uint32_t membase32, memlimit32; /* mmio window below 4GB */ + uint64_t membase64, memlimit64; /* mmio window above 4GB */ + struct slotinfo slotinfo[MAXSLOTS]; +}; + +static struct businfo *pci_businfo[MAXBUSES]; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static struct pci_devemu *pci_emul_finddev(char *name); +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * <bus>:<slot>:<func>,<emul>[,<config>] + * <slot>[:<func>],<emul>[,<config>] + * + * slot is 0..31 + * func is 0..7 + * emul is a string describing the type of PCI device e.g. virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3:0,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + + fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); +} + +int +pci_parse_slot(char *opt) +{ + struct businfo *bi; + struct slotinfo *si; + char *emul, *config, *str, *cp; + int error, bnum, snum, fnum; + + error = -1; + str = strdup(opt); + + emul = config = NULL; + if ((cp = strchr(str, ',')) != NULL) { + *cp = '\0'; + emul = cp + 1; + if ((cp = strchr(emul, ',')) != NULL) { + *cp = '\0'; + config = cp + 1; + } + } else { + pci_parse_slot_usage(opt); + goto done; + } + + /* <bus>:<slot>:<func> */ + if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { + bnum = 0; + /* <slot>:<func> */ + if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { + fnum = 0; + /* <slot> */ + if (sscanf(str, "%d", &snum) != 1) { + snum = -1; + } + } + } + + if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || + fnum < 0 || fnum >= MAXFUNCS) { + pci_parse_slot_usage(opt); + goto done; + } + + if (pci_businfo[bnum] == NULL) + pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); + + bi = pci_businfo[bnum]; + si = &bi->slotinfo[snum]; + + if (si->si_funcs[fnum].fi_name != NULL) { + fprintf(stderr, "pci slot %d:%d already occupied!\n", + snum, fnum); + goto done; + } + + if (pci_emul_finddev(emul) == NULL) { + fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", + snum, fnum, emul); + goto done; + } + + error = 0; + si->si_funcs[fnum].fi_name = emul; + si->si_funcs[fnum].fi_param = config; + +done: + if (error) + free(str); + + return (error); +} + +static int +pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) +{ + + if (offset < pi->pi_msix.pba_offset) + return (0); + + if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + return (0); + } + + return (1); +} + +int +pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value) +{ + int msix_entry_offset; + int tab_index; + char *dest; + + /* support only 4 or 8 byte writes */ + if (size != 4 && size != 8) + return (-1); + + /* + * Return if table index is beyond what device supports + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + if (tab_index >= pi->pi_msix.table_count) + return (-1); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned writes */ + if ((msix_entry_offset % size) != 0) + return (-1); + + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + *((uint32_t *)dest) = value; + else + *((uint64_t *)dest) = value; + + return (0); +} + +uint64_t +pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) +{ + char *dest; + int msix_entry_offset; + int tab_index; + uint64_t retval = ~0; + + /* + * The PCI standard only allows 4 and 8 byte accesses to the MSI-X + * table but we also allow 1 byte access to accomodate reads from + * ddb. + */ + if (size != 1 && size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 1) + retval = *((uint8_t *)dest); + else if (size == 4) + retval = *((uint32_t *)dest); + else + retval = *((uint64_t *)dest); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +int +pci_msix_table_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.table_bar); + else + return (-1); +} + +int +pci_msix_pba_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.pba_bar); + else + return (-1); +} + +static int +pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, + offset, bytes); + else + (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, + bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || + pdi->pi_bar[bidx].type == PCIBAR_MEM64); + assert(addr >= pdi->pi_bar[bidx].addr && + addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + 4, *val & 0xffffffff); + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, + 4, *val >> 32); + } else { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, 4); + *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset + 4, 4) << 32; + } else { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, size); + } + } + + return (0); +} + + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +/* + * Register (or unregister) the MMIO or I/O region associated with the BAR + * register 'idx' of an emulated pci device. + */ +static void +modify_bar_registration(struct pci_devinst *pi, int idx, int registration) +{ + int error; + struct inout_port iop; + struct mem_range mr; + + switch (pi->pi_bar[idx].type) { + case PCIBAR_IO: + bzero(&iop, sizeof(struct inout_port)); + iop.name = pi->pi_name; + iop.port = pi->pi_bar[idx].addr; + iop.size = pi->pi_bar[idx].size; + if (registration) { + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_io_handler; + iop.arg = pi; + error = register_inout(&iop); + } else + error = unregister_inout(&iop); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + bzero(&mr, sizeof(struct mem_range)); + mr.name = pi->pi_name; + mr.base = pi->pi_bar[idx].addr; + mr.size = pi->pi_bar[idx].size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = pci_emul_mem_handler; + mr.arg1 = pi; + mr.arg2 = idx; + error = register_mem(&mr); + } else + error = unregister_mem(&mr); + break; + default: + error = EINVAL; + break; + } + assert(error == 0); +} + +static void +unregister_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 0); +} + +static void +register_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 1); +} + +/* Are we decoding i/o port accesses for the emulated pci device? */ +static int +porten(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_PORTEN); +} + +/* Are we decoding memory accesses for the emulated pci device? */ +static int +memen(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_MEMEN); +} + +/* + * Update the MMIO or I/O address that is decoded by the BAR register. + * + * If the pci device has enabled the address space decoding then intercept + * the address range decoded by the BAR register. + */ +static void +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +{ + int decode; + + if (pi->pi_bar[idx].type == PCIBAR_IO) + decode = porten(pi); + else + decode = memen(pi); + + if (decode) + unregister_bar(pi, idx); + + switch (type) { + case PCIBAR_IO: + case PCIBAR_MEM32: + pi->pi_bar[idx].addr = addr; + break; + case PCIBAR_MEM64: + pi->pi_bar[idx].addr &= ~0xffffffffUL; + pi->pi_bar[idx].addr |= addr; + break; + case PCIBAR_MEMHI64: + pi->pi_bar[idx].addr &= 0xffffffff; + pi->pi_bar[idx].addr |= addr; + break; + default: + assert(0); + } + + if (decode) + register_bar(pi, idx); +} + +int +pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + int error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + /* Enforce minimum BAR sizes required by the PCI standard */ + if (type == PCIBAR_IO) { + if (size < 4) + size = 4; + } else { + if (size < 16) + size = 16; + } + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + /* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless then allocation size is larger than some arbitrary + * number (32MB currently). + */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } else { + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + } + break; + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + register_bar(pdi, idx); + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, reallen; + uint16_t sts; + + assert(caplen > 0); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) + capoff = CAP_START_OFFSET; + else + capoff = pi->pi_capend + 1; + + /* Check if we have enough space */ + if (capoff + reallen > PCI_REGMAX + 1) + return (-1); + + /* Set the previous capability pointer */ + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else + pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, 0); + + pi->pi_prevcap = capoff; + pi->pi_capend = capoff + reallen - 1; + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, + int func, struct funcinfo *fi) +{ + struct pci_devinst *pdi; + int err; + + pdi = calloc(1, sizeof(struct pci_devinst)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = bus; + pdi->pi_slot = slot; + pdi->pi_func = func; + pthread_mutex_init(&pdi->pi_lintr.lock, NULL); + pdi->pi_lintr.pin = 0; + pdi->pi_lintr.state = IDLE; + pdi->pi_lintr.pirq_pin = 0; + pdi->pi_lintr.ioapic_irq = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + err = (*pde->pe_init)(ctx, pdi, fi->fi_param); + if (err == 0) + fi->fi_devi = pdi; + else + free(pdi); + + return (err); +} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size) +{ + CTASSERT(sizeof(struct msixcap) == 12); + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. + * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = msgnum - 1; + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + pi->pi_msix.pba_size = PBA_SIZE(msgnum); + + pci_msix_table_init(pi, msgnum); + + pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + tab_size + pi->pi_msix.pba_size); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + +void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask; + int off, table_bar; + + off = offset - capoff; + table_bar = pi->pi_msix.table_bar; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.addr = addrlo; + pi->pi_msi.msg_data = msgdata; + pi->pi_msi.maxmsgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.maxmsgnum = 0; + } + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + + /* XXX don't write to the readonly parts */ + CFGWRITE(pi, offset, val, bytes); +} + +#define PCIECAP_VERSION 0x2 +int +pci_emul_add_pciecap(struct pci_devinst *pi, int type) +{ + int err; + struct pciecap pciecap; + + CTASSERT(sizeof(struct pciecap) == 60); + + if (type != PCIEM_TYPE_ROOT_PORT) + return (-1); + + bzero(&pciecap, sizeof(pciecap)); + + pciecap.capid = PCIY_EXPRESS; + pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + + err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); + return (err); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly. + * However, some o/s's do 4-byte writes that include these. + * For this case, trim the write back to 2 bytes and adjust + * the data. + */ + if (offset == capoff || offset == capoff + 1) { + if (offset == capoff && bytes == 4) { + bytes = 2; + offset += 2; + val >>= 16; + } else + return; + } + + capid = pci_get_cfgdata8(pi, capoff); + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_EXPRESS: + pciecap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + uint16_t sts; + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) + return (1); + } + return (0); +} + +static int +pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + /* + * Ignore writes; return 0xff's for reads. The mem read code + * will take care of truncating to the correct size. + */ + if (dir == MEM_F_READ) { + *val = 0xffffffffffffffff; + } + + return (0); +} + +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + +#define BUSIO_ROUNDUP 32 +#define BUSMEM_ROUNDUP (1024 * 1024) + +int +init_pci(struct vmctx *ctx) +{ + struct mem_range mr; + struct pci_devemu *pde; + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + size_t lowmem; + int bus, slot, func; + int error; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = vm_get_lowmem_limit(ctx); + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + /* + * Keep track of the i/o and memory resources allocated to + * this bus. + */ + bi->iobase = pci_emul_iobase; + bi->membase32 = pci_emul_membase32; + bi->membase64 = pci_emul_membase64; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + pde = pci_emul_finddev(fi->fi_name); + assert(pde != NULL); + error = pci_emul_init(ctx, pde, bus, slot, + func, fi); + if (error) + return (error); + } + } + + /* + * Add some slop to the I/O and memory resources decoded by + * this bus to give a guest some flexibility if it wants to + * reprogram the BARs. + */ + pci_emul_iobase += BUSIO_ROUNDUP; + pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); + bi->iolimit = pci_emul_iobase; + + pci_emul_membase32 += BUSMEM_ROUNDUP; + pci_emul_membase32 = roundup2(pci_emul_membase32, + BUSMEM_ROUNDUP); + bi->memlimit32 = pci_emul_membase32; + + pci_emul_membase64 += BUSMEM_ROUNDUP; + pci_emul_membase64 = roundup2(pci_emul_membase64, + BUSMEM_ROUNDUP); + bi->memlimit64 = pci_emul_membase64; + } + + /* + * PCI backends are initialized before routing INTx interrupts + * so that LPC devices are able to reserve ISA IRQs before + * routing PIRQ pins. + */ + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_devi == NULL) + continue; + pci_lintr_route(fi->fi_devi); + } + } + } + lpc_pirq_routed(); + + /* + * The guest physical memory map looks like the following: + * [0, lowmem) guest system memory + * [lowmem, lowmem_limit) memory hole (may be absent) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware + * [4GB, 4GB + highmem) + */ + + /* + * Accesses to memory addresses that are not allocated to system + * memory or PCI devices return 0xff's. + */ + lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); + + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); + assert(error == 0); + + return (0); +} + +#ifdef __FreeBSD__ +static void +pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" Zero,"); + dsdt_line(" 0x%X", ioapic_irq); + dsdt_line(" },"); +} + +static void +pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + char *name; + + name = lpc_pirq_name(pirq_pin); + if (name == NULL) + return; + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" %s,", name); + dsdt_line(" 0x00"); + dsdt_line(" },"); + free(name); +} + +/* + * A bhyve virtual machine has a flat PCI hierarchy with a root port + * corresponding to each PCI bus. + */ +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + int count, func, slot; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. + */ + if (bus != 0) + return; + } + + dsdt_line(" Device (PC%02X)", bus); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); + dsdt_line(" Name (_ADR, Zero)"); + + dsdt_line(" Method (_BBN, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Return (0x%08X)", bus); + dsdt_line(" }"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " + "MaxFixed, PosDecode,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bus); + dsdt_line(" 0x%04X, // Range Maximum", bus); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0001, // Length"); + dsdt_line(" ,, )"); + + if (bus == 0) { + dsdt_indent(3); + dsdt_fixed_ioport(0xCF8, 8); + dsdt_unindent(3); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0000, // Range Minimum"); + dsdt_line(" 0x0CF7, // Range Maximum"); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0CF8, // Length"); + dsdt_line(" ,, , TypeStatic)"); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0D00, // Range Minimum"); + dsdt_line(" 0x%04X, // Range Maximum", + PCI_EMUL_IOBASE - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + PCI_EMUL_IOBASE - 0x0D00); + dsdt_line(" ,, , TypeStatic)"); + + if (bi == NULL) { + dsdt_line(" })"); + goto done; + } + } + assert(bi != NULL); + + /* i/o window */ + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); + dsdt_line(" 0x%04X, // Range Maximum", + bi->iolimit - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + bi->iolimit - bi->iobase); + dsdt_line(" ,, , TypeStatic)"); + + /* mmio window (32-bit) */ + dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x00000000, // Granularity"); + dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); + dsdt_line(" 0x%08X, // Range Maximum\n", + bi->memlimit32 - 1); + dsdt_line(" 0x00000000, // Translation Offset"); + dsdt_line(" 0x%08X, // Length\n", + bi->memlimit32 - bi->membase32); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + + /* mmio window (64-bit) */ + dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x0000000000000000, // Granularity"); + dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); + dsdt_line(" 0x%016lX, // Range Maximum\n", + bi->memlimit64 - 1); + dsdt_line(" 0x0000000000000000, // Translation Offset"); + dsdt_line(" 0x%016lX, // Length\n", + bi->memlimit64 - bi->membase64); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + dsdt_line(" })"); + + count = pci_count_lintr(bus); + if (count != 0) { + dsdt_indent(2); + dsdt_line("Name (PPRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Name (APRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_apic_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Method (_PRT, 0, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (PICM)"); + dsdt_line(" {"); + dsdt_line(" Return (APRT)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (PPRT)"); + dsdt_line(" }"); + dsdt_line("}"); + dsdt_unindent(2); + } + + dsdt_indent(2); + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + pi = si->si_funcs[func].fi_devi; + if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) + pi->pi_d->pe_write_dsdt(pi); + } + } + dsdt_unindent(2); +done: + dsdt_line(" }"); +} + +void +pci_write_dsdt(void) +{ + int bus; + + dsdt_indent(1); + dsdt_line("Name (PICM, 0x00)"); + dsdt_line("Method (_PIC, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" Store (Arg0, PICM)"); + dsdt_line("}"); + dsdt_line(""); + dsdt_line("Scope (_SB)"); + dsdt_line("{"); + for (bus = 0; bus < MAXBUSES; bus++) + pci_bus_write_dsdt(bus); + dsdt_line("}"); + dsdt_unindent(1); +} +#endif + +int +pci_bus_configured(int bus) +{ + assert(bus >= 0 && bus < MAXBUSES); + return (pci_businfo[bus] != NULL); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_maxmsgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.maxmsgnum); + else + return (0); +} + +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); + } +} + +void +pci_generate_msi(struct pci_devinst *pi, int index) +{ + + if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { + vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, + pi->pi_msi.msg_data + index); + } +} + +static bool +pci_lintr_permitted(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || + (cmd & PCIM_CMD_INTxDIS))); +} + +void +pci_lintr_request(struct pci_devinst *pi) +{ + struct businfo *bi; + struct slotinfo *si; + int bestpin, bestcount, pin; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + + /* + * Just allocate a pin from our slot. The pin will be + * assigned IRQs later when interrupts are routed. + */ + si = &bi->slotinfo[pi->pi_slot]; + bestpin = 0; + bestcount = si->si_intpins[0].ii_count; + for (pin = 1; pin < 4; pin++) { + if (si->si_intpins[pin].ii_count < bestcount) { + bestpin = pin; + bestcount = si->si_intpins[pin].ii_count; + } + } + + si->si_intpins[bestpin].ii_count++; + pi->pi_lintr.pin = bestpin + 1; + pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); +} + +static void +pci_lintr_route(struct pci_devinst *pi) +{ + struct businfo *bi; + struct intxinfo *ii; + + if (pi->pi_lintr.pin == 0) + return; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; + + /* + * Attempt to allocate an I/O APIC pin for this intpin if one + * is not yet assigned. + */ + if (ii->ii_ioapic_irq == 0) + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + assert(ii->ii_ioapic_irq > 0); + + /* + * Attempt to allocate a PIRQ pin for this intpin if one is + * not yet assigned. + */ + if (ii->ii_pirq_pin == 0) + ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); + assert(ii->ii_pirq_pin > 0); + + pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; + pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; + pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); +} + +void +pci_lintr_assert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == IDLE) { + if (pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } else + pi->pi_lintr.state = PENDING; + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +void +pci_lintr_deassert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED) { + pi->pi_lintr.state = IDLE; + pci_irq_deassert(pi); + } else if (pi->pi_lintr.state == PENDING) + pi->pi_lintr.state = IDLE; + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +static void +pci_lintr_update(struct pci_devinst *pi) +{ + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { + pci_irq_deassert(pi); + pi->pi_lintr.state = PENDING; + } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +int +pci_count_lintr(int bus) +{ + int count, slot, pin; + struct slotinfo *slotinfo; + + count = 0; + if (pci_businfo[bus] != NULL) { + for (slot = 0; slot < MAXSLOTS; slot++) { + slotinfo = &pci_businfo[bus]->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + if (slotinfo->si_intpins[pin].ii_count != 0) + count++; + } + } + } + return (count); +} + +void +pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) +{ + struct businfo *bi; + struct slotinfo *si; + struct intxinfo *ii; + int slot, pin; + + if ((bi = pci_businfo[bus]) == NULL) + return; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + ii = &si->si_intpins[pin]; + if (ii->ii_count != 0) + cb(bus, slot, pin + 1, ii->ii_pirq_pin, + ii->ii_ioapic_irq, arg); + } + } +} + +/* + * Return 1 if the emulated device in 'slot' is a multi-function device. + * Return 0 otherwise. + */ +static int +pci_emul_is_mfdev(int bus, int slot) +{ + struct businfo *bi; + struct slotinfo *si; + int f, numfuncs; + + numfuncs = 0; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + for (f = 0; f < MAXFUNCS; f++) { + if (si->si_funcs[f].fi_devi != NULL) { + numfuncs++; + } + } + } + return (numfuncs > 1); +} + +/* + * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on + * whether or not is a multi-function being emulated in the pci 'slot'. + */ +static void +pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) +{ + int mfdev; + + if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { + mfdev = pci_emul_is_mfdev(bus, slot); + switch (bytes) { + case 1: + case 2: + *rv &= ~PCIM_MFDEV; + if (mfdev) { + *rv |= PCIM_MFDEV; + } + break; + case 4: + *rv &= ~(PCIM_MFDEV << 16); + if (mfdev) { + *rv |= (PCIM_MFDEV << 16); + } + break; + } + } +} + +static uint32_t +bits_changed(uint32_t old, uint32_t new, uint32_t mask) +{ + + return ((old ^ new) & mask); +} + +static void +pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) +{ + int i; + uint16_t old; + + /* + * The command register is at an offset of 4 bytes and thus the + * guest could write 1, 2 or 4 bytes starting at this offset. + */ + + old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ + new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + + /* + * If the MMIO or I/O address space decoding has changed then + * register/unregister all BARs that decode that address space. + */ + for (i = 0; i <= PCI_BARMAX; i++) { + switch (pi->pi_bar[i].type) { + case PCIBAR_NONE: + case PCIBAR_MEMHI64: + break; + case PCIBAR_IO: + /* I/O address space decoding changed? */ + if (bits_changed(old, new, PCIM_CMD_PORTEN)) { + if (porten(pi)) + register_bar(pi, i); + else + unregister_bar(pi, i); + } + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + /* MMIO address space decoding changed? */ + if (bits_changed(old, new, PCIM_CMD_MEMEN)) { + if (memen(pi)) + register_bar(pi, i); + else + unregister_bar(pi, i); + } + break; + default: + assert(0); + } + } + + /* + * If INTx has been unmasked and is pending, assert the + * interrupt. + */ + pci_lintr_update(pi); +} + +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, + int coff, int bytes, uint32_t *eax) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + struct pci_devemu *pe; + int idx, needcfg; + uint64_t addr, bar, mask; + + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + pi = si->si_funcs[func].fi_devi; + } else + pi = NULL; + + /* + * Just return if there is no device at this slot:func or if the + * the guest is doing an un-aligned access. + */ + if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || + (coff & (bytes - 1)) != 0) { + if (in) + *eax = 0xffffffff; + return; + } + + /* + * Ignore all writes beyond the standard config space and return all + * ones on reads. + */ + if (coff >= PCI_REGMAX + 1) { + if (in) { + *eax = 0xffffffff; + /* + * Extended capabilities begin at offset 256 in config + * space. Absence of extended capabilities is signaled + * with all 0s in the extended capability header at + * offset 256. + */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; + } + + pe = pi->pi_d; + + /* + * Config read + */ + if (in) { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgread != NULL) { + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); + } else { + needcfg = 1; + } + + if (needcfg) { + if (bytes == 1) + *eax = pci_get_cfgdata8(pi, coff); + else if (bytes == 2) + *eax = pci_get_cfgdata16(pi, coff); + else + *eax = pci_get_cfgdata32(pi, coff); + } + + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); + } else { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgwrite != NULL && + (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) + return; + + /* + * Special handling for write to BAR registers + */ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + /* + * Ignore writes to BAR registers that are not + * 4-byte aligned. + */ + if (bytes != 4 || (coff & 0x3) != 0) + return; + idx = (coff - PCIR_BAR(0)) / 4; + mask = ~(pi->pi_bar[idx].size - 1); + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + pi->pi_bar[idx].addr = bar = 0; + break; + case PCIBAR_IO: + addr = *eax & mask; + addr &= 0xffff; + bar = addr | PCIM_BAR_IO_SPACE; + /* + * Register the new BAR value for interception + */ + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_IO); + } + break; + case PCIBAR_MEM32: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM32); + } + break; + case PCIBAR_MEM64: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + if (addr != (uint32_t)pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM64); + } + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + addr = ((uint64_t)*eax << 32) & mask; + bar = addr >> 32; + if (bar != pi->pi_bar[idx - 1].addr >> 32) { + update_bar_address(pi, addr, idx - 1, + PCIBAR_MEMHI64); + } + break; + default: + assert(0); + } + pci_set_cfgdata32(pi, coff, bar); + + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else if (coff == PCIR_COMMAND) { + pci_emul_cmdwrite(pi, *eax, bytes); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 0xffff : 0xff; + return (0); + } + + if (in) { + x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DIOSZ 8 +#define DMEMSZ 4096 +struct pci_emul_dsoftc { + uint8_t ioregs[DIOSZ]; + uint8_t memregs[DMEMSZ]; +}; + +#define PCI_EMUL_MSI_MSGS 4 +#define PCI_EMUL_MSIX_MSGS 16 + +static int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = calloc(1, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("diow: iow too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)&sc->ioregs[offset] = value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_maxmsgnum(pi); i++) + pci_generate_msi(pi, i); + } + } + + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("diow: memw too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->memregs[offset] = value; + } else if (size == 2) { + *(uint16_t *)&sc->memregs[offset] = value; + } else if (size == 4) { + *(uint32_t *)&sc->memregs[offset] = value; + } else if (size == 8) { + *(uint64_t *)&sc->memregs[offset] = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? + */ + } + + if (baridx > 1) { + printf("diow: unknown bar idx %d\n", baridx); + } +} + +static uint64_t +pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("dior: ior too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->ioregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->ioregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("dior: memr too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->memregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->memregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->memregs[offset]; + } else if (size == 8) { + value = *(uint64_t *) &sc->memregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 1) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h new file mode 100644 index 0000000000..6af01c4c3c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -0,0 +1,283 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $ + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/_pthreadtypes.h> + +#include <dev/pci/pcireg.h> + +#include <assert.h> + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ + +struct vmctx; +struct pci_devinst; +struct memory_region; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, + char *opts); + + /* ACPI DSDT enumeration */ + void (*pe_write_dsdt)(struct pci_devinst *); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* BAR read/write callbacks */ + void (*pe_barwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size); +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +} __packed; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. + */ +#define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +struct pci_devinst { + struct pci_devemu *pi_d; + struct vmctx *pi_vmctx; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + int pi_bar_getsize; + int pi_prevcap; + int pi_capend; + + struct { + int8_t pin; + enum lintr_stat state; + int pirq_pin; + int ioapic_irq; + pthread_mutex_t lock; + } pi_lintr; + + struct { + int enabled; + uint64_t addr; + uint64_t msg_data; + int maxmsgnum; + } pi_msi; + + struct { + int enabled; + int table_bar; + int pba_bar; + uint32_t table_offset; + int table_count; + uint32_t pba_offset; + int pba_size; + int function_mask; + struct msix_table_entry *table; /* allocated at runtime */ + } pi_msix; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; + +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; + +struct msixcap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t table_info; /* bar index and offset within it */ + uint32_t pba_info; /* bar index and offset within it */ +} __packed; + +struct pciecap { + uint8_t capid; + uint8_t nextptr; + uint16_t pcie_capabilities; + + uint32_t dev_capabilities; /* all devices */ + uint16_t dev_control; + uint16_t dev_status; + + uint32_t link_capabilities; /* devices with links */ + uint16_t link_control; + uint16_t link_status; + + uint32_t slot_capabilities; /* ports with slots */ + uint16_t slot_control; + uint16_t slot_status; + + uint16_t root_control; /* root ports */ + uint16_t root_capabilities; + uint32_t root_status; + + uint32_t dev_capabilities2; /* all devices */ + uint16_t dev_control2; + uint16_t dev_status2; + + uint32_t link_capabilities2; /* devices with links */ + uint16_t link_control2; + uint16_t link_status2; + + uint32_t slot_capabilities2; /* ports with slots */ + uint16_t slot_control2; + uint16_t slot_status2; +} __packed; + +typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, + int ioapic_irq, void *arg); + +int init_pci(struct vmctx *ctx); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, + enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); +void pci_lintr_assert(struct pci_devinst *pi); +void pci_lintr_deassert(struct pci_devinst *pi); +void pci_lintr_request(struct pci_devinst *pi); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); +int pci_msix_table_bar(struct pci_devinst *pi); +int pci_msix_pba_bar(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +int pci_parse_slot(char *opt); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); +int pci_count_lintr(int bus); +void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); +void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); +int pci_bus_configured(int bus); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c new file mode 100644 index 0000000000..08956d082e --- /dev/null +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $"); + +#include "pci_emul.h" + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + (void) pci_hostbridge_init(ctx, pi, opts); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + return (0); +} + +struct pci_devemu pci_de_amd_hostbridge = { + .pe_emu = "amd_hostbridge", + .pe_init = pci_amd_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_amd_hostbridge); + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c new file mode 100644 index 0000000000..97ee330c65 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -0,0 +1,351 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $"); + +#include <sys/param.h> +#include <machine/vmm.h> + +#include <assert.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +/* + * Implement an 8 pin PCI interrupt router compatible with the router + * present on Intel's ICH10 chip. + */ + +/* Fields in each PIRQ register. */ +#define PIRQ_DIS 0x80 +#define PIRQ_IRQ 0x0f + +/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */ +#define PERMITTED_IRQS 0xdef8 +#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0) + +/* IRQ count to disable an IRQ. */ +#define IRQ_DISABLED 0xff + +static struct pirq { + uint8_t reg; + int use_count; + int active_count; + pthread_mutex_t lock; +} pirqs[8]; + +static u_char irq_counts[16]; +static int pirq_cold = 1; + +/* + * Returns true if this pin is enabled with a valid IRQ. Setting the + * register to a reserved IRQ causes interrupts to not be asserted as + * if the pin was disabled. + */ +static bool +pirq_valid_irq(int reg) +{ + + if (reg & PIRQ_DIS) + return (false); + return (IRQ_PERMITTED(reg & PIRQ_IRQ)); +} + +uint8_t +pirq_read(int pin) +{ + + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg); +} + +void +pirq_write(struct vmctx *ctx, int pin, uint8_t val) +{ + struct pirq *pirq; + + assert(pin > 0 && pin <= nitems(pirqs)); + pirq = &pirqs[pin - 1]; + pthread_mutex_lock(&pirq->lock); + if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) { + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ); + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + } + pthread_mutex_unlock(&pirq->lock); +} + +void +pci_irq_reserve(int irq) +{ + + assert(irq < nitems(irq_counts)); + assert(pirq_cold); + assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); + irq_counts[irq] = IRQ_DISABLED; +} + +void +pci_irq_use(int irq) +{ + + assert(irq < nitems(irq_counts)); + assert(pirq_cold); + if (irq_counts[irq] != IRQ_DISABLED) + irq_counts[irq]++; +} + +void +pci_irq_init(struct vmctx *ctx) +{ + int i; + + for (i = 0; i < nitems(pirqs); i++) { + pirqs[i].reg = PIRQ_DIS; + pirqs[i].use_count = 0; + pirqs[i].active_count = 0; + pthread_mutex_init(&pirqs[i].lock, NULL); + } + for (i = 0; i < nitems(irq_counts); i++) { + if (IRQ_PERMITTED(i)) + irq_counts[i] = 0; + else + irq_counts[i] = IRQ_DISABLED; + } +} + +void +pci_irq_assert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count++; + if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) { + vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +void +pci_irq_deassert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count--; + if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) { + vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +int +pirq_alloc_pin(struct vmctx *ctx) +{ + int best_count, best_irq, best_pin, irq, pin; + + pirq_cold = 1; + + /* First, find the least-used PIRQ pin. */ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } + } + pirqs[best_pin].use_count++; + + /* Second, route this pin to an IRQ. */ + if (pirqs[best_pin].reg == PIRQ_DIS) { + best_irq = -1; + best_count = 0; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (irq_counts[irq] == IRQ_DISABLED) + continue; + if (best_irq == -1 || irq_counts[irq] < best_count) { + best_irq = irq; + best_count = irq_counts[irq]; + } + } + assert(best_irq != 0); + irq_counts[best_irq]++; + pirqs[best_pin].reg = best_irq; + vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); + } + + return (best_pin + 1); +} + +int +pirq_irq(int pin) +{ + + if (pin == -1) + return (255); + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg & PIRQ_IRQ); +} + +/* XXX: Generate $PIR table. */ + +#ifdef __FreeBSD__ +static void +pirq_dsdt(void) +{ + char *irq_prs, *old; + int irq, pin; + + irq_prs = NULL; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (!IRQ_PERMITTED(irq)) + continue; + if (irq_prs == NULL) + asprintf(&irq_prs, "%d", irq); + else { + old = irq_prs; + asprintf(&irq_prs, "%s,%d", old, irq); + free(old); + } + } + + /* + * A helper method to validate a link register's value. This + * duplicates pirq_valid_irq(). + */ + dsdt_line(""); + dsdt_line("Method (PIRV, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ); + dsdt_line(" If (LLess (Local0, 0x03))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x08))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x0D))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" Return (0x01)"); + dsdt_line("}"); + + for (pin = 0; pin < nitems(pirqs); pin++) { + dsdt_line(""); + dsdt_line("Device (LNK%c)", 'A' + pin); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))"); + dsdt_line(" Name (_UID, 0x%02X)", pin + 1); + dsdt_line(" Method (_STA, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" If (PIRV (PIR%c))", 'A' + pin); + dsdt_line(" {"); + dsdt_line(" Return (0x0B)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (0x09)"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line(" Name (_PRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {%s}", irq_prs); + dsdt_line(" })"); + dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {}"); + dsdt_line(" })"); + dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)", + pin + 1, 'A' + pin); + dsdt_line(" Method (_CRS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin, + PIRQ_DIS | PIRQ_IRQ); + dsdt_line(" If (PIRV (Local0))"); + dsdt_line(" {"); + dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Store (0x00, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Return (CB%02X)", pin + 1); + dsdt_line(" }"); + dsdt_line(" Method (_DIS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Store (0x80, PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Method (_SRS, 1, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin); + dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin); + dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line("}"); + } + free(irq_prs); +} +LPC_DSDT(pirq_dsdt); +#endif diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h new file mode 100644 index 0000000000..483f12b61e --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef __PCI_IRQ_H__ +#define __PCI_IRQ_H__ + +struct pci_devinst; + +void pci_irq_assert(struct pci_devinst *pi); +void pci_irq_deassert(struct pci_devinst *pi); +void pci_irq_init(struct vmctx *ctx); +void pci_irq_reserve(int irq); +void pci_irq_use(int irq); +int pirq_alloc_pin(struct vmctx *ctx); +int pirq_irq(int pin); +uint8_t pirq_read(int pin); +void pirq_write(struct vmctx *ctx, int pin, uint8_t val); + +#endif diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c new file mode 100644 index 0000000000..8c060150dc --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -0,0 +1,433 @@ +/*- + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $"); + +#include <sys/types.h> +#include <machine/vmm.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "uart_emul.h" + +#define IO_ICU1 0x20 +#define IO_ICU2 0xA0 + +SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); +SET_DECLARE(lpc_sysres_set, struct lpc_sysres); + +#define ELCR_PORT 0x4d0 +SYSRES_IO(ELCR_PORT, 2); + +#define IO_TIMER1_PORT 0x40 + +#define NMISC_PORT 0x61 +SYSRES_IO(NMISC_PORT, 1); + +static struct pci_devinst *lpc_bridge; + +#define LPC_UART_NUM 2 +static struct lpc_uart_softc { + struct uart_softc *uart_softc; + const char *opts; + int iobase; + int irq; + int enabled; +} lpc_uart_softc[LPC_UART_NUM]; + +static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; + +/* + * LPC device configuration is in the following form: + * <lpc_device_name>[,<options>] + * For e.g. "com1,stdio" + */ +int +lpc_device_parse(const char *opts) +{ + int unit, error; + char *str, *cpy, *lpcdev; + + error = -1; + str = cpy = strdup(opts); + lpcdev = strsep(&str, ","); + if (lpcdev != NULL) { + for (unit = 0; unit < LPC_UART_NUM; unit++) { + if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { + lpc_uart_softc[unit].opts = str; + error = 0; + goto done; + } + } + } + +done: + if (error) + free(cpy); + + return (error); +} + +static void +lpc_uart_intr_assert(void *arg) +{ + struct lpc_uart_softc *sc = arg; + + assert(sc->irq >= 0); + + vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); +} + +static void +lpc_uart_intr_deassert(void *arg) +{ + /* + * The COM devices on the LPC bus generate edge triggered interrupts, + * so nothing more to do here. + */ +} + +static int +lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int offset; + struct lpc_uart_softc *sc = arg; + + offset = port - sc->iobase; + + switch (bytes) { + case 1: + if (in) + *eax = uart_read(sc->uart_softc, offset); + else + uart_write(sc->uart_softc, offset, *eax); + break; + case 2: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + } + break; + default: + return (-1); + } + + return (0); +} + +static int +lpc_init(void) +{ + struct lpc_uart_softc *sc; + struct inout_port iop; + const char *name; + int unit, error; + + /* COM1 and COM2 */ + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + name = lpc_uart_names[unit]; + + if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { + fprintf(stderr, "Unable to allocate resources for " + "LPC device %s\n", name); + return (-1); + } + pci_irq_reserve(sc->irq); + + sc->uart_softc = uart_init(lpc_uart_intr_assert, + lpc_uart_intr_deassert, sc); + + if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' " + "for LPC device %s\n", sc->opts, name); + return (-1); + } + + bzero(&iop, sizeof(struct inout_port)); + iop.name = name; + iop.port = sc->iobase; + iop.size = UART_IO_BAR_SIZE; + iop.flags = IOPORT_F_INOUT; + iop.handler = lpc_uart_io_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + sc->enabled = 1; + } + + return (0); +} + +#ifdef __FreeBSD__ +static void +pci_lpc_write_dsdt(struct pci_devinst *pi) +{ + struct lpc_dsdt **ldpp, *ldp; + + dsdt_line(""); + dsdt_line("Device (ISA)"); + dsdt_line("{"); + dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); + dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); + dsdt_line(" {"); + dsdt_line(" Offset (0x60),"); + dsdt_line(" PIRA, 8,"); + dsdt_line(" PIRB, 8,"); + dsdt_line(" PIRC, 8,"); + dsdt_line(" PIRD, 8,"); + dsdt_line(" Offset (0x68),"); + dsdt_line(" PIRE, 8,"); + dsdt_line(" PIRF, 8,"); + dsdt_line(" PIRG, 8,"); + dsdt_line(" PIRH, 8"); + dsdt_line(" }"); + dsdt_line(""); + + dsdt_indent(1); + SET_FOREACH(ldpp, lpc_dsdt_set) { + ldp = *ldpp; + ldp->handler(); + } + + dsdt_line(""); + dsdt_line("Device (PIC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_ICU1, 2); + dsdt_fixed_ioport(IO_ICU2, 2); + dsdt_fixed_irq(2); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (TIMR)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_TIMER1_PORT, 4); + dsdt_fixed_irq(0); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + dsdt_unindent(1); + + dsdt_line("}"); +} + +static void +pci_lpc_sysres_dsdt(void) +{ + struct lpc_sysres **lspp, *lsp; + + dsdt_line(""); + dsdt_line("Device (SIO)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + + dsdt_indent(2); + SET_FOREACH(lspp, lpc_sysres_set) { + lsp = *lspp; + switch (lsp->type) { + case LPC_SYSRES_IO: + dsdt_fixed_ioport(lsp->base, lsp->length); + break; + case LPC_SYSRES_MEM: + dsdt_fixed_mem32(lsp->base, lsp->length); + break; + } + } + dsdt_unindent(2); + + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(pci_lpc_sysres_dsdt); + +static void +pci_lpc_uart_dsdt(void) +{ + struct lpc_uart_softc *sc; + int unit; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + if (!sc->enabled) + continue; + dsdt_line(""); + dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); + dsdt_line(" Name (_UID, %d)", unit + 1); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); + dsdt_fixed_irq(sc->irq); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + } +} +LPC_DSDT(pci_lpc_uart_dsdt); +#endif + +static int +pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int pirq_pin; + + if (bytes == 1) { + pirq_pin = 0; + if (coff >= 0x60 && coff <= 0x63) + pirq_pin = coff - 0x60 + 1; + if (coff >= 0x68 && coff <= 0x6b) + pirq_pin = coff - 0x68 + 5; + if (pirq_pin != 0) { + pirq_write(ctx, pirq_pin, val); + pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + return (0); + } + } + return (-1); +} + +static void +pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ +} + +static uint64_t +pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + return (0); +} + +#define LPC_DEV 0x7000 +#define LPC_VENDOR 0x8086 + +static int +pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* + * Do not allow more than one LPC bridge to be configured. + */ + if (lpc_bridge != NULL) { + fprintf(stderr, "Only one LPC bridge is allowed.\n"); + return (-1); + } + + /* + * Enforce that the LPC can only be configured on bus 0. This + * simplifies the ACPI DSDT because it can provide a decode for + * all legacy i/o ports behind bus 0. + */ + if (pi->pi_bus != 0) { + fprintf(stderr, "LPC bridge can be present only on bus 0.\n"); + return (-1); + } + + if (lpc_init() != 0) + return (-1); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + + lpc_bridge = pi; + + return (0); +} + +char * +lpc_pirq_name(int pin) +{ + char *name; + + if (lpc_bridge == NULL) + return (NULL); + asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); + return (name); +} + +void +lpc_pirq_routed(void) +{ + int pin; + + if (lpc_bridge == NULL) + return; + + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); +} + +struct pci_devemu pci_de_lpc = { + .pe_emu = "lpc", + .pe_init = pci_lpc_init, +#ifdef __FreeBSD__ + .pe_write_dsdt = pci_lpc_write_dsdt, +#endif + .pe_cfgwrite = pci_lpc_cfgwrite, + .pe_barwrite = pci_lpc_write, + .pe_barread = pci_lpc_read +}; +PCI_EMUL_SET(pci_de_lpc); diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h new file mode 100644 index 0000000000..4f725b1dd3 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef _LPC_H_ +#define _LPC_H_ + +#include <sys/linker_set.h> + +typedef void (*lpc_write_dsdt_t)(void); + +struct lpc_dsdt { + lpc_write_dsdt_t handler; +}; + +#define LPC_DSDT(handler) \ + static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \ + (handler), \ + }; \ + DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__)) + +enum lpc_sysres_type { + LPC_SYSRES_IO, + LPC_SYSRES_MEM +}; + +struct lpc_sysres { + enum lpc_sysres_type type; + uint32_t base; + uint32_t length; +}; + +#define LPC_SYSRES(type, base, length) \ + static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \ + (type), \ + (base), \ + (length) \ + }; \ + DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__)) + +#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) +#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) + +int lpc_device_parse(const char *opt); +char *lpc_pirq_name(int pin); +void lpc_pirq_routed(void); + +#endif diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c new file mode 100644 index 0000000000..65e2d9c57d --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -0,0 +1,392 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <sys/disk.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <md5.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTBLK_RINGSZ 64 + +#ifdef __FreeBSD__ +#define VTBLK_MAXSEGS 32 +#else +#define VTBLK_MAXSEGS 16 +#endif + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + uint16_t vbc_geom_c; + uint8_t vbc_geom_h; + uint8_t vbc_geom_s; + uint32_t vbc_blk_size; + uint32_t vbc_sectors_max; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + int vbsc_fd; + struct vtblk_config vbsc_cfg; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; +}; + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtblk_reset(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !\n")); + vi_reset_dev(&sc->vbsc_vs); +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + uint8_t *status; + int i, n; + int err; + int iolen; + int writeop, type; + off_t offset; + struct iovec iov[VTBLK_MAXSEGS + 2]; + uint16_t flags[VTBLK_MAXSEGS + 2]; + + n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= VTBLK_MAXSEGS + 2); + + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = (struct virtio_block_hdr *)iov[0].iov_base; + + status = iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE); + + offset = vbh->vbh_sector * DEV_BSIZE; + + iolen = 0; + for (i = 1; i < n; i++) { + /* + * - write op implies read-only descriptor, + * - read/ident op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; + } + + DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", + writeop ? "write" : "read/ident", iolen, i - 1, offset)); + + switch (type) { + case VBH_OP_WRITE: + err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset); + break; + case VBH_OP_READ: + err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + strlcpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + err = 0; + break; + default: + err = -ENOSYS; + break; + } + + /* convert errno into a virtio block error return */ + if (err < 0) { + if (err == -ENOSYS) + *status = VTBLK_S_UNSUPP; + else + *status = VTBLK_S_IOERR; + } else + *status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + vq_relchain(vq, 1); +} + +static void +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtblk_softc *sc = vsc; + + vq_startchains(vq); + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct stat sbuf; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int fd; + int sectsz; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * The supplied backing file has to exist + */ + fd = open(opts, O_RDWR); + if (fd < 0) { + perror("Could not open backing file"); + return (1); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (1); + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; +#ifdef __FreeBSD__ + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + close(fd); + return (1); + } + assert(size != 0); + assert(sectsz != 0); + } +#endif + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + + /* record fd of storage device/file */ + sc->vbsc_fd = fd; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geom_h = 0; + sc->vbsc_cfg.vbc_geom_s = 0; + sc->vbsc_cfg.vbc_sectors_max = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. + */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c new file mode 100644 index 0000000000..e58bdd0115 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/select.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <net/ethernet.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <md5.h> +#include <pthread.h> +#include <pthread_np.h> +#ifndef __FreeBSD__ +#include <poll.h> +#include <libdlpi.h> +#endif + +#include "bhyverun.h" +#include "pci_emul.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "virtio.h" + +#define VTNET_RINGSZ 1024 + +#define VTNET_MAXSEGS 32 + +/* + * Host capabilities. Note that we only offer a few of these. + */ +#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE \ + (1 << 21) /* guest can send gratuitous pkts */ + +#define VTNET_S_HOSTCAPS \ + ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY) + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 /* NB: not yet supported */ + +#define VTNET_MAXQ 3 + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + +#ifdef __FreeBSD + int vsc_tapfd; +#else + dlpi_handle_t vsc_dhp; + int vsc_dlpifd; +#endif + int vsc_rx_ready; + volatile int resetting; /* set and checked outside lock */ + + uint32_t vsc_features; + struct virtio_net_config vsc_config; + + pthread_mutex_t rx_mtx; + int rx_in_progress; + + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; +}; + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; + +/* + * If the transmit thread is active then stall until it is done. + */ +static void +pci_vtnet_txwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * If the receive thread is active then stall until it is done. + */ +static void +pci_vtnet_rxwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->rx_mtx); + while (sc->rx_in_progress) { + pthread_mutex_unlock(&sc->rx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->rx_mtx); + } + pthread_mutex_unlock(&sc->rx_mtx); +} + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + sc->resetting = 1; + + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); + + sc->vsc_rx_ready = 0; + + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; +} + +/* + * Called to send a buffer chain out to the tap device + */ +#ifdef __FreeBSD__ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} +#else +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + (void) dlpi_send(sc->vsc_dhp, NULL, NULL, + iov[i].iov_base, iov[i].iov_len, NULL); + } +} +#endif + +#ifdef __FreeBSD__ +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; +#endif + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct vqueue_info *vq; + struct virtio_net_rxhdr *vrx; + uint8_t *buf; +#ifdef __FreeBSD__ + int len; +#endif + struct iovec iov[VTNET_MAXSEGS]; +#ifndef __FreeBSD__ + size_t len; + int ret; +#endif + int total_len = 0; + + /* + * Should never be called without a valid tap fd + */ +#ifdef __FreeBSD__ + assert(sc->vsc_tapfd != -1); +#else + assert(sc->vsc_dlpifd != -1); +#endif + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { +#ifdef __FreeBSD__ + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); +#endif + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + vq_startchains(vq); + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ +#ifdef __FreeBSD__ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); +#endif + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain + */ + if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) { + assert(vq_getchain(vq, iov, 1, NULL) == 1); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = (struct virtio_net_rxhdr *)iov[0].iov_base; + buf = (uint8_t *)(vrx + 1); + total_len = iov[0].iov_len; +#ifdef __FreeBSD__ + len = read(sc->vsc_tapfd, buf, + iov[0].iov_len - sizeof(struct virtio_net_rxhdr)); + + if (len < 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_endchains(vq, 0); + return; + } +#else + len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr); + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, + &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_endchains(vq, 0); + return; + } +#endif + } else { + int i; + int num_segs; + num_segs = vq_getchain(vq, iov, + VTNET_MAXSEGS, NULL); + vrx = (struct virtio_net_rxhrd *)iov[0].iov_base; + total_len = iov[0].iov_len; + for (i = 1; i < num_segs; i++) { + buf = (uint8_t *)iov[i].iov_base; + total_len += iov[i].iov_len; +#ifdef __FreeBSD__ + len = read(sc->vsc_tapfd, buf, iov[i].iov_len); + if (len < 0 && errno == EWOULDBLOCK) { + /* + * No more packets, + * but still some avail ring entries. + * Interrupt if needed/appropriate. + */ + break; + } +#else + len = iov[i].iov_len; + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, + &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + /* + * No more packets, + * but still some avail ring entries. + * Interrupt if needed/appropriate. + */ + total_len = 0; + break; + } +#endif + } + if (total_len == 0) { + vq_endchains(vq, 0); + return; + } + } + + /* + * The only valid field in the rx packet header is the + * number of buffers, which is always 1 without TSO + * support. + */ + memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); + vrx->vrh_bufs = 1; + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, total_len); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->rx_mtx); + sc->rx_in_progress = 1; + pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; + pthread_mutex_unlock(&sc->rx_mtx); + +} +#else +static void * +pci_vtnet_poll_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + pollfd_t pollset; + + pollset.fd = sc->vsc_dlpifd; + pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno == EINTR) + continue; + fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno); + continue; + } + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtnet_tap_rx(sc); + pthread_mutex_unlock(&sc->vsc_mtx); + } +} +#endif + +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + int i, n; + int plen, tlen; + + /* + * Obtain chain of descriptors. The first one is + * really the header descriptor, so we need to sum + * up two lengths: packet length and transfer length. + */ + n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + plen = 0; + tlen = iov[0].iov_len; + for (i = 1; i < n; i++) { + plen += iov[i].iov_len; + tlen += iov[i].iov_len; + } + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); + pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + + /* chain is processed, release it and set tlen */ + vq_relchain(vq, tlen); +} + +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? + */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int have_work, error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + do { + if (sc->resetting) + have_work = 0; + else + have_work = vq_has_descs(vq); + + if (!have_work) { + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, + &sc->tx_mtx); + assert(error == 0); + } + } while (!have_work); + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + vq_startchains(vq); + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. + */ + vq_endchains(vq, 1); + + pthread_mutex_lock(&sc->tx_mtx); + } +} + +#ifdef notyet +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} +#endif + +#ifdef __FreeBSD__ +static int +pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} +#endif + + +static int +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ +#ifdef __FreeBSD__ + MD5_CTX mdctx; + unsigned char digest[16]; +#else + uchar_t physaddr[DLPI_PHYSADDR_MAX]; + size_t physaddrlen = DLPI_PHYSADDR_MAX; + int error; +#endif + char nstr[80]; + char tname[MAXCOMLEN + 1]; + struct pci_vtnet_softc *sc; + const char *env_msi; + char *devname; + char *vtopts; + int mac_provided; + int use_msix; + + sc = malloc(sizeof(struct pci_vtnet_softc)); + memset(sc, 0, sizeof(struct pci_vtnet_softc)); + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Use MSI if set by user + */ + use_msix = 1; + if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) { + if (strcasecmp(env_msi, "yes") == 0) + use_msix = 0; + } + + /* + * Attempt to open the tap device and read the MAC address + * if specified + */ + mac_provided = 0; +#ifdef __FreeBSD__ + sc->vsc_tapfd = -1; +#endif + if (opts != NULL) { + char tbuf[80]; + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + +#ifdef __FreBSD__ + if (vtopts != NULL) { + err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } +#endif + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + free(devname); + +#ifdef __FreeBSD__ + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + } else { + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_tap_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + } +#else + if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { + WPRINTF(("open of vnic device %s failed\n", opts)); + } + + if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, &physaddrlen) != DLPI_SUCCESS) { + WPRINTF(("read MAC address of vnic device %s failed\n", opts)); + } + if (physaddrlen != ETHERADDRL) { + WPRINTF(("bad MAC address len %d on vnic device %s\n", physaddrlen, opts)); + } + memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); + + if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { + WPRINTF(("bind of vnic device %s failed\n", opts)); + } + + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(physical) of vnic device %s failed\n", opts)); + } + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(SAP) of vnic device %s failed\n", opts)); + } + + sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); + + if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { + WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", opts)); + dlpi_close(sc->vsc_dhp); + sc->vsc_dlpifd = -1; + } + + error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); + assert(error == 0); +#endif + } + +#ifdef __FreeBSD__ + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->vsc_config.mac[0] = 0x00; + sc->vsc_config.mac[1] = 0xa0; + sc->vsc_config.mac[2] = 0x98; + sc->vsc_config.mac[3] = digest[0]; + sc->vsc_config.mac[4] = digest[1]; + sc->vsc_config.mac[5] = digest[2]; + } +#endif + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + + /* link always up */ + sc->vsc_config.status = 1; + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) + return (1); + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_bar(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_in_progress = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. + */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot); + pthread_set_name_np(sc->tx_tid, tname); + + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < 6) { + assert(offset + size <= 6); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + return (1); + } + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +struct pci_devemu pci_de_vnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vnet); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c new file mode 100644 index 0000000000..f4d5d528be --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -0,0 +1,706 @@ +/* + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/ioctl.h> +#include <sys/viona_io.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <signal.h> +#include <poll.h> +#include <libdladm.h> +#include <libdllink.h> +#include <libdlvnic.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VIONA_RINGSZ 1024 + +/* + * PCI config-space register offsets + */ +#define VIONA_R_CFG0 24 +#define VIONA_R_CFG1 25 +#define VIONA_R_CFG2 26 +#define VIONA_R_CFG3 27 +#define VIONA_R_CFG4 28 +#define VIONA_R_CFG5 29 +#define VIONA_R_CFG6 30 +#define VIONA_R_CFG7 31 +#define VIONA_R_MAX 31 + +#define VIONA_REGSZ VIONA_R_MAX+1 + +/* + * Host capabilities + */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ + +#define VIONA_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS) + +/* + * Queue definitions. + */ +#define VIONA_RXQ 0 +#define VIONA_TXQ 1 +#define VIONA_CTLQ 2 + +#define VIONA_MAXQ 3 + +/* + * Debug printf + */ +static int pci_viona_debug; +#define DPRINTF(params) if (pci_viona_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_viona_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + + int vsc_curq; + int vsc_status; + int vsc_isr; + + datalink_id_t vsc_linkid; + char vsc_linkname[MAXLINKNAMELEN]; + int vsc_vnafd; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VIONA_MAXQ]; + uint16_t vsc_msix_table_idx[VIONA_MAXQ]; + /* + * Flag to see if host is already sending data out. + * If it is, no need to wait for lock and send interrupt to host + * for new data. + */ + boolean_t vsc_tx_kick_lock_held; + + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; +}; +#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx) + +/* + * Return the size of IO BAR that maps virtio header and device specific + * region. The size would vary depending on whether MSI-X is enabled or + * not. + */ +static uint64_t +pci_viona_iosize(struct pci_devinst *pi) +{ + if (pci_msix_enabled(pi)) + return (VIONA_REGSZ); + else + return (VIONA_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX)); +} + +static uint16_t +pci_viona_qsize(int qnum) +{ + /* XXX no ctl queue currently */ + if (qnum == VIONA_CTLQ) { + return (0); + } + + /* XXX fixed currently. Maybe different for tx/rx/ctl */ + return (VIONA_RINGSZ); +} + +static void +pci_viona_ring_reset(struct pci_viona_softc *sc, int ring) +{ + int error; + + assert(ring < VIONA_MAXQ); + + switch (ring) { + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET); + if (error != 0) { + WPRINTF(("ioctl viona rx ring reset failed %d\n", + error)); + } else { + sc->vsc_pfn[VIONA_RXQ] = 0; + } + break; + case VIONA_TXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET); + if (error != 0) { + WPRINTF(("ioctl viona tx ring reset failed %d\n", + error)); + } else { + sc->vsc_pfn[VIONA_TXQ] = 0; + } + break; + case VIONA_CTLQ: + default: + break; + } +} + +static void +pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value) +{ + + if (value == 0) { + DPRINTF(("viona: device reset requested !\n")); + pci_viona_ring_reset(sc, VIONA_RXQ); + pci_viona_ring_reset(sc, VIONA_TXQ); + } + + sc->vsc_status = value; +} + +static void * +pci_viona_poll_thread(void *param) +{ + struct pci_viona_softc *sc = param; + pollfd_t pollset; + int error; + + pollset.fd = sc->vsc_vnafd; + pollset.events = POLLIN | POLLOUT; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } else { + WPRINTF(("pci_viona_poll_thread poll()" + "error %d\n", errno)); + break; + } + } + if (pollset.revents & POLLIN) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[VIONA_RXQ]); + error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR); + if (error != 0) { + WPRINTF(("ioctl viona rx intr clear failed" + " %d\n", error)); + } + } + + if (pollset.revents & POLLOUT) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[VIONA_TXQ]); + error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR); + if (error != 0) { + WPRINTF(("ioctl viona tx intr clear failed" + " %d\n", error)); + } + } + } + + pthread_exit(NULL); +} + +static void +pci_viona_ping_rxq(struct pci_viona_softc *sc) +{ + int error; + + error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK); + if (error != 0) { + WPRINTF(("ioctl viona rx ring kick failed %d\n", error)); + } +} + +static void * +pci_viona_tx_thread(void *param) +{ + struct pci_viona_softc *sc = (struct pci_viona_softc *)param; + int error; + + pthread_mutex_lock(&sc->tx_mtx); + for (;;) { + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + sc->vsc_tx_kick_lock_held = B_TRUE; + error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK); + if (error != 0) { + WPRINTF(("ioctl viona tx ring kick failed %d\n", + error)); + } + sc->vsc_tx_kick_lock_held = B_FALSE; + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +static void +pci_viona_ping_txq(struct pci_viona_softc *sc) +{ + /* Signal the tx thread for processing */ + if (sc->vsc_tx_kick_lock_held) + return; + pthread_mutex_lock(&sc->tx_mtx); + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +static void +pci_viona_ping_ctlq(struct pci_viona_softc *sc) +{ + DPRINTF(("viona: control qnotify!\n\r")); +} + +static void +pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) +{ + int qnum = sc->vsc_curq; + vioc_ring_init_t vna_ri; + int error; + + assert(qnum < VIONA_MAXQ); + + sc->vsc_pfn[qnum] = (pfn << VRING_PFN); + + vna_ri.ri_qsize = pci_viona_qsize(qnum); + vna_ri.ri_qaddr = (pfn << VRING_PFN); + + switch (qnum) { + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri); + if (error != 0) { + WPRINTF(("ioctl viona rx ring init failed %d\n", + error)); + } + break; + case VIONA_TXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri); + if (error != 0) { + WPRINTF(("ioctl viona tx ring init failed %d\n", + error)); + } + break; + case VIONA_CTLQ: + default: + break; + } +} + +static int +pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) +{ + vioc_create_t vna_create; + char devname[MAXNAMELEN]; + int ctlfd; + int error; + + sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); + if (sc->vsc_vnafd == -1) { + WPRINTF(("open viona ctl failed\n")); + return (-1); + } + + vna_create.c_linkid = sc->vsc_linkid; + strlcpy(vna_create.c_vmname, vmname, + sizeof (vna_create.c_vmname)); + vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, + NULL); + vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), + &vna_create.c_himem_size, NULL); + error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); + if (error != 0) { + WPRINTF(("ioctl viona create failed %d\n", error)); + return (-1); + } + + return (0); +} + +static int +pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + dladm_handle_t handle; + dladm_status_t status; + dladm_vnic_attr_t attr; + char errmsg[DLADM_STRSIZE]; + int error; + struct pci_viona_softc *sc; + int i; + + if (opts == NULL) { + printf("virtio-viona: vnic required\n"); + return (1); + } + + sc = malloc(sizeof (struct pci_viona_softc)); + memset(sc, 0, sizeof (struct pci_viona_softc)); + + pi->pi_arg = sc; + sc->vsc_pi = pi; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN); + + if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) { + WPRINTF(("could not open /dev/dld")); + free(sc); + return (1); + } + + if (dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid, + NULL, NULL, NULL) != DLADM_STATUS_OK) { + WPRINTF(("dladm_name2info() for %s failed: %s\n", opts, + dladm_status2str(status, errmsg))); + dladm_close(handle); + free(sc); + return (1); + } + + if (dladm_vnic_info(handle, sc->vsc_linkid, &attr, + DLADM_OPT_ACTIVE) != DLADM_STATUS_OK) { + WPRINTF(("dladm_vnic_info() for %s failed: %s\n", opts, + dladm_status2str(status, errmsg))); + dladm_close(handle); + free(sc); + return (1); + } + + sc->vsc_tx_kick_lock_held = B_FALSE; + memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL); + + dladm_close(handle); + + error = pci_viona_viona_init(ctx, sc); + if (error != 0) { + free(sc); + return (1); + } + + error = pthread_create(NULL, NULL, pci_viona_poll_thread, sc); + assert(error == 0); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + + /* MSI-X support */ + for (i = 0; i < VIONA_MAXQ; i++) + sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; + + /* + * BAR 1 used to map MSI-X table and PBA + */ + if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) { + free(sc); + return (1); + } + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + + /* + * Initialize tx semaphore & spawn TX processing thread + * As of now, only one thread for TX desc processing is + * spawned. + */ + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc); + + return (0); +} + +/* + * Function pointer array to handle queue notifications + */ +static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = { + pci_viona_ping_rxq, + pci_viona_ping_txq, + pci_viona_ping_ctlq +}; + +static uint64_t +viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) +{ + /* + * Device specific offsets used by guest would change based on + * whether MSI-X capability is enabled or not + */ + if (!pci_msix_enabled(pi)) { + if (offset >= VTCFG_R_MSIX) + return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); + } + + return (offset); +} + +static void +pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void *ptr; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_write: 2big, offset %ld size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); + if (err != 0) + WPRINTF(("ioctl feature negotiation returned" + " err = %d\n", err)); + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_viona_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VIONA_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VIONA_MAXQ); + (*pci_viona_qnotify[value])(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_viona_update_status(sc, value); + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + sc->vsc_msix_table_idx[VIONA_CTLQ] = value; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + sc->vsc_msix_table_idx[sc->vsc_curq] = value; + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + /* + * The driver is allowed to change the MAC address + */ + sc->vsc_macaddr[offset - VIONA_R_CFG0] = value; + if (size == 1) { + *(uint8_t *)ptr = value; + } else if (size == 2) { + *(uint16_t *)ptr = value; + } else { + *(uint32_t *)ptr = value; + } + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VIONA_R_CFG6: + case VIONA_R_CFG7: + DPRINTF(("viona: write to readonly reg %ld\n\r", offset)); + break; + default: + DPRINTF(("viona: unknown i/o write offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +uint64_t +pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void *ptr; + uint64_t value; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_read: 2big, offset %ld size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); + if (err != 0) + WPRINTF(("ioctl get host features returned" + " err = %d\n", err)); + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_viona_qsize(sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + value = sc->vsc_msix_table_idx[VIONA_CTLQ]; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + value = sc->vsc_msix_table_idx[sc->vsc_curq]; + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + if (size == 1) { + value = *(uint8_t *)ptr; + } else if (size == 2) { + value = *(uint16_t *)ptr; + } else { + value = *(uint32_t *)ptr; + } + break; + case VIONA_R_CFG6: + assert(size != 4); + value = 0x01; /* XXX link always up */ + break; + case VIONA_R_CFG7: + assert(size == 1); + value = 0; /* XXX link status in LSB */ + break; + default: + DPRINTF(("viona: unknown i/o read offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_viona = { + .pe_emu = "virtio-net-viona", + .pe_init = pci_viona_init, + .pe_barwrite = pci_viona_write, + .pe_barread = pci_viona_read +}; +PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c new file mode 100644 index 0000000000..70c4f1fae8 --- /dev/null +++ b/usr/src/cmd/bhyve/pm.c @@ -0,0 +1,333 @@ +/*- + * Copyright (c) 2013 Advanced Computing Technologies LLC + * Written by: John H. Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $"); + +#include <sys/types.h> +#include <machine/vmm.h> + +#include <assert.h> +#include <pthread.h> +#ifndef __FreeBSD__ +#include <stdlib.h> +#endif +#include <signal.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "pci_irq.h" +#include "pci_lpc.h" + +static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; +#ifdef __FreeBSD__ +static struct mevent *power_button; +static sig_t old_power_handler; +#endif + +/* + * Reset Control register at I/O port 0xcf9. Bit 2 forces a system + * reset when it transitions from 0 to 1. Bit 1 selects the type of + * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard" + * reset. + */ +static int +reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static uint8_t reset_control; + + if (bytes != 1) + return (-1); + if (in) + *eax = reset_control; + else { + reset_control = *eax; + + /* Treat hard and soft resets the same. */ + if (reset_control & 0x4) { +#ifdef __FreeBSD__ + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); +#else + exit(0); +#endif + } + } + return (0); +} +INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler); + +/* + * ACPI's SCI is a level-triggered interrupt. + */ +static int sci_active; + +static void +sci_assert(struct vmctx *ctx) +{ + + if (sci_active) + return; + vm_isa_assert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 1; +} + +static void +sci_deassert(struct vmctx *ctx) +{ + + if (!sci_active) + return; + vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 0; +} + +/* + * Power Management 1 Event Registers + * + * The only power management event supported is a power button upon + * receiving SIGTERM. + */ +static uint16_t pm1_enable, pm1_status; + +#define PM1_TMR_STS 0x0001 +#define PM1_BM_STS 0x0010 +#define PM1_GBL_STS 0x0020 +#define PM1_PWRBTN_STS 0x0100 +#define PM1_SLPBTN_STS 0x0200 +#define PM1_RTC_STS 0x0400 +#define PM1_WAK_STS 0x8000 + +#define PM1_TMR_EN 0x0001 +#define PM1_GBL_EN 0x0020 +#define PM1_PWRBTN_EN 0x0100 +#define PM1_SLPBTN_EN 0x0200 +#define PM1_RTC_EN 0x0400 + +static void +sci_update(struct vmctx *ctx) +{ + int need_sci; + + /* See if the SCI should be active or not. */ + need_sci = 0; + if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS)) + need_sci = 1; + if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS)) + need_sci = 1; + if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS)) + need_sci = 1; + if (need_sci) + sci_assert(ctx); + else + sci_deassert(ctx); +} + +static int +pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_status; + else { + /* + * Writes are only permitted to clear certain bits by + * writing 1 to those flags. + */ + pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS | + PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS)); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} + +static int +pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_enable; + else { + /* + * Only permit certain bits to be set. We never use + * the global lock, but ACPI-CA whines profusely if it + * can't set GBL_EN. + */ + pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler); +INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler); + +#ifdef __FreeBSD__ +static void +power_button_handler(int signal, enum ev_type type, void *arg) +{ + struct vmctx *ctx; + + ctx = arg; + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); +} +#endif + +/* + * Power Management 1 Control Register + * + * This is mostly unimplemented except that we wish to handle writes that + * set SPL_EN to handle S5 (soft power off). + */ +static uint16_t pm1_control; + +#define PM1_SCI_EN 0x0001 +#define PM1_SLP_TYP 0x1c00 +#define PM1_SLP_EN 0x2000 +#define PM1_ALWAYS_ZERO 0xc003 + +static int +pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + if (in) + *eax = pm1_control; + else { + /* + * Various bits are write-only or reserved, so force them + * to zero in pm1_control. Always preserve SCI_EN as OSPM + * can never change it. + */ + pm1_control = (pm1_control & PM1_SCI_EN) | + (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO)); + + /* + * If SLP_EN is set, check for S5. Bhyve's _S5_ method + * says that '5' should be stored in SLP_TYP for S5. + */ + if (*eax & PM1_SLP_EN) { + if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { +#ifdef __FreeBSD__ + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + assert(error == 0 || errno == EALREADY); +#else + exit(0); +#endif + } + } + } + return (0); +} +INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler); +#ifdef __FreeBSD__ +SYSRES_IO(PM1A_EVT_ADDR, 8); +#endif + +/* + * ACPI SMI Command Register + * + * This write-only register is used to enable and disable ACPI. + */ +static int +smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + assert(!in); + if (bytes != 1) + return (-1); + + pthread_mutex_lock(&pm_lock); + switch (*eax) { + case BHYVE_ACPI_ENABLE: + pm1_control |= PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button == NULL) { + power_button = mevent_add(SIGTERM, EVF_SIGNAL, + power_button_handler, ctx); + old_power_handler = signal(SIGTERM, SIG_IGN); + } +#endif + break; + case BHYVE_ACPI_DISABLE: + pm1_control &= ~PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button != NULL) { + mevent_delete(power_button); + power_button = NULL; + signal(SIGTERM, old_power_handler); + } +#endif + break; + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler); +#ifdef __FreeBSD__ +SYSRES_IO(SMI_CMD, 1); +#endif + +void +sci_init(struct vmctx *ctx) +{ + + /* + * Mark ACPI's SCI as level trigger and bump its use count + * in the PIRQ router. + */ + pci_irq_use(SCI_INT); + vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); +} diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c new file mode 100644 index 0000000000..92ab24be57 --- /dev/null +++ b/usr/src/cmd/bhyve/pmtmr.c @@ -0,0 +1,212 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $"); + +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/time.h> +#include <machine/cpufunc.h> + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <assert.h> +#include <pthread.h> +#ifndef __FreeBSD__ +#include <kstat.h> +#endif + +#include "acpi.h" +#include "inout.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +static pthread_mutex_t pmtmr_mtx; +static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT; + +static uint64_t pmtmr_old; + +static uint64_t pmtmr_tscf; +static uint64_t pmtmr_tsc_old; + +#ifdef __FreeBSD__ +static clockid_t clockid = CLOCK_UPTIME_FAST; +static struct timespec pmtmr_uptime_old; + +#define timespecsub(vvp, uvp) \ + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_nsec -= (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += 1000000000; \ + } \ + } while (0) + +static uint64_t +timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold) +{ + struct timespec tsdiff; + int64_t nsecs; + + tsdiff = *tsnew; + timespecsub(&tsdiff, tsold); + nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec; + assert(nsecs >= 0); + + return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old); +} +#endif + +static uint64_t +tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old) +{ + + return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old); +} + +static void +pmtmr_init(void) +{ +#ifdef __FreeBSD__ + size_t len; + int smp_tsc, err; + struct timespec tsnew, tsold = { 0 }; + + len = sizeof(smp_tsc); + err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0); + assert(err == 0); + + if (smp_tsc) { + len = sizeof(pmtmr_tscf); + err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len, + NULL, 0); + assert(err == 0); + + pmtmr_tsc_old = rdtsc(); + pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); + } else { + if (getenv("BHYVE_PMTMR_PRECISE") != NULL) + clockid = CLOCK_UPTIME; + + err = clock_gettime(clockid, &tsnew); + assert(err == 0); + + pmtmr_uptime_old = tsnew; + pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold); + } +#else + kstat_ctl_t *kstat_ctl; + kstat_t *kstat; + kstat_named_t *kstat_cpu_freq; + + kstat_ctl = kstat_open(); + kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL); + kstat_read(kstat_ctl, kstat, NULL); + kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz"); + pmtmr_tscf = kstat_cpu_freq->value.ul; + kstat_close(kstat_ctl); + + pmtmr_tsc_old = rdtsc(); + pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); +#endif + pthread_mutex_init(&pmtmr_mtx, NULL); +} + +static uint32_t +pmtmr_val(void) +{ + struct timespec tsnew; + uint64_t pmtmr_tsc_new; + uint64_t pmtmr_new; + int error; + + pthread_once(&pmtmr_once, pmtmr_init); + + pthread_mutex_lock(&pmtmr_mtx); + +#ifdef __FreeBSD__ + if (pmtmr_tscf) { + pmtmr_tsc_new = rdtsc(); + pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); + pmtmr_tsc_old = pmtmr_tsc_new; + } else { + error = clock_gettime(clockid, &tsnew); + assert(error == 0); + + pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old); + pmtmr_uptime_old = tsnew; + } +#else + pmtmr_tsc_new = rdtsc(); + pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); + pmtmr_tsc_old = pmtmr_tsc_new; +#endif + pmtmr_old = pmtmr_new; + + pthread_mutex_unlock(&pmtmr_mtx); + + return (pmtmr_new); +} + +static int +pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 4) + return (-1); + + *eax = pmtmr_val(); + + return (0); +} + +INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler); diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c new file mode 100644 index 0000000000..dcb481aac4 --- /dev/null +++ b/usr/src/cmd/bhyve/post.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $"); + +#include <sys/types.h> + +#include <assert.h> + +#include "inout.h" +#include "pci_lpc.h" + +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); +SYSRES_IO(0x84, 1); diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c new file mode 100644 index 0000000000..22e566ac21 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -0,0 +1,418 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "atkbdc.h" +#include "console.h" + +/* keyboard device commands */ +#define PS2KC_RESET_DEV 0xff +#define PS2KC_DISABLE 0xf5 +#define PS2KC_ENABLE 0xf4 +#define PS2KC_SET_TYPEMATIC 0xf3 +#define PS2KC_SEND_DEV_ID 0xf2 +#define PS2KC_SET_SCANCODE_SET 0xf0 +#define PS2KC_ECHO 0xee +#define PS2KC_SET_LEDS 0xed + +#define PS2KC_BAT_SUCCESS 0xaa +#define PS2KC_ACK 0xfa + +#define PS2KBD_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2KBD_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2kbd_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + bool enabled; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ +}; + +static void +fifo_init(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static int +fifo_available(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + return (fifo->num < fifo->size); +} + +static void +fifo_put(struct ps2kbd_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2kbd_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +int +ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) +{ + pthread_mutex_lock(&sc->mtx); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2KC_SET_TYPEMATIC: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_SCANCODE_SET: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_LEDS: + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + } else { + switch (val) { + case PS2KC_RESET_DEV: + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + fifo_put(sc, PS2KC_BAT_SUCCESS); + break; + case PS2KC_DISABLE: + sc->enabled = false; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ENABLE: + sc->enabled = true; + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_TYPEMATIC: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SEND_DEV_ID: + fifo_put(sc, PS2KC_ACK); + fifo_put(sc, 0xab); + fifo_put(sc, 0x83); + break; + case PS2KC_SET_SCANCODE_SET: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ECHO: + fifo_put(sc, PS2KC_ECHO); + break; + case PS2KC_SET_LEDS: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard command " + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +/* + * Translate keysym to type 2 scancode and insert into keyboard buffer. + */ +static void +ps2kbd_keysym_queue(struct ps2kbd_softc *sc, + int down, uint32_t keysym) +{ + /* ASCII to type 2 scancode lookup table */ + const uint8_t translation[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, + }; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + switch (keysym) { + case 0x0 ... 0x7f: + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, translation[keysym]); + break; + case 0xff08: /* Back space */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x66); + break; + case 0xff09: /* Tab */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0d); + break; + case 0xff0d: /* Return */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x5a); + break; + case 0xff1b: /* Escape */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x76); + break; + case 0xff51: /* Left arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x6b); + break; + case 0xff52: /* Up arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x75); + break; + case 0xff53: /* Right arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x74); + break; + case 0xff54: /* Down arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x72); + break; + case 0xffbe: /* F1 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x05); + break; + case 0xffbf: /* F2 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x06); + break; + case 0xffc0: /* F3 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x04); + break; + case 0xffc1: /* F4 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0c); + break; + case 0xffc2: /* F5 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x03); + break; + case 0xffc3: /* F6 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0b); + break; + case 0xffc4: /* F7 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x83); + break; + case 0xffc5: /* F8 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0a); + break; + case 0xffc6: /* F9 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x01); + break; + case 0xffc7: /* F10 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x09); + break; + case 0xffc8: /* F11 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x78); + break; + case 0xffc9: /* F12 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x07); + break; + case 0xffe1: /* Left shift */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x12); + break; + case 0xffe2: /* Right shift */ + /* XXX */ + break; + case 0xffe3: /* Left control */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x14); + break; + case 0xffe4: /* Right control */ + /* XXX */ + break; + case 0xffe7: /* Left meta */ + /* XXX */ + break; + case 0xffe8: /* Right meta */ + /* XXX */ + break; + case 0xffe9: /* Left alt */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x11); + break; + case 0xffea: /* Right alt */ + /* XXX */ + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", + keysym); + break; + } +} + +static void +ps2kbd_event(int down, uint32_t keysym, void *arg) +{ + struct ps2kbd_softc *sc = arg; + + pthread_mutex_lock(&sc->mtx); + if (!sc->enabled) { + pthread_mutex_unlock(&sc->mtx); + return; + } + + ps2kbd_keysym_queue(sc, down, keysym); + pthread_mutex_unlock(&sc->mtx); + + atkbdc_event(sc->atkbdc_sc); +} + +struct ps2kbd_softc * +ps2kbd_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2kbd_softc *sc; + + sc = calloc(1, sizeof (struct ps2kbd_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + console_kbd_register(ps2kbd_event, sc); + + return (sc); +} diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h new file mode 100644 index 0000000000..34c31b1ea8 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2KBD_H_ +#define _PS2KBD_H_ + +struct atkbdc_softc; + +struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); + +int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); +void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); + +#endif /* _PS2KBD_H_ */ diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c new file mode 100644 index 0000000000..e96fbbf411 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -0,0 +1,371 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "atkbdc.h" +#include "console.h" + +/* mouse device commands */ +#define PS2MC_RESET_DEV 0xff +#define PS2MC_SET_DEFAULTS 0xf6 +#define PS2MC_DISABLE 0xf5 +#define PS2MC_ENABLE 0xf4 +#define PS2MC_SET_SAMPLING_RATE 0xf3 +#define PS2MC_SEND_DEV_ID 0xf2 +#define PS2MC_SET_REMOTE_MODE 0xf0 +#define PS2MC_SEND_DEV_DATA 0xeb +#define PS2MC_SET_STREAM_MODE 0xea +#define PS2MC_SEND_DEV_STATUS 0xe9 +#define PS2MC_SET_RESOLUTION 0xe8 +#define PS2MC_SET_SCALING1 0xe7 +#define PS2MC_SET_SCALING2 0xe6 + +#define PS2MC_BAT_SUCCESS 0xaa +#define PS2MC_ACK 0xfa + +/* mouse device id */ +#define PS2MOUSE_DEV_ID 0x0 + +/* mouse status bits */ +#define PS2M_STS_REMOTE_MODE 0x40 +#define PS2M_STS_ENABLE_DEV 0x20 +#define PS2M_STS_SCALING_21 0x10 +#define PS2M_STS_MID_BUTTON 0x04 +#define PS2M_STS_RIGHT_BUTTON 0x02 +#define PS2M_STS_LEFT_BUTTON 0x01 + +#define PS2MOUSE_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2MOUSE_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2mouse_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + uint8_t status; + uint8_t resolution; + uint8_t sampling_rate; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ + + int cur_x, cur_y; + int delta_x, delta_y; +}; + +static void +fifo_init(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_put(struct ps2mouse_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2mouse_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +static void +movement_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->delta_x = 0; + sc->delta_y = 0; +} + +static void +movement_update(struct ps2mouse_softc *sc, int x, int y) +{ + sc->delta_x += x - sc->cur_x; + sc->delta_y += sc->cur_y - y; + sc->cur_x = x; + sc->cur_y = y; +} + +static void +movement_get(struct ps2mouse_softc *sc) +{ + uint8_t val0, val1, val2; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + val0 = sc->status & (PS2M_STS_LEFT_BUTTON | + PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + + if (sc->delta_x >= 0) { + if (sc->delta_x > 255) { + val0 |= (1 << 6); + val1 = 255; + } else + val1 = sc->delta_x; + } else { + val0 |= (1 << 4); + if (sc->delta_x < -255) { + val0 |= (1 << 6); + val1 = 255; + } else + val1 = sc->delta_x; + } + sc->delta_x = 0; + + if (sc->delta_y >= 0) { + if (sc->delta_y > 255) { + val0 |= (1 << 7); + val2 = 255; + } else + val2 = sc->delta_y; + } else { + val0 |= (1 << 5); + if (sc->delta_y < -255) { + val0 |= (1 << 7); + val2 = 255; + } else + val2 = sc->delta_y; + } + sc->delta_y = 0; + + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); +} + +static void +ps2mouse_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + fifo_reset(sc); + movement_reset(sc); + sc->status = 0x8; + sc->resolution = 4; + sc->sampling_rate = 100; + + sc->cur_x = 0; + sc->cur_y = 0; + sc->delta_x = 0; + sc->delta_y = 0; +} + +int +ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) +{ + pthread_mutex_lock(&sc->mtx); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2MC_SET_SAMPLING_RATE: + sc->sampling_rate = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_RESOLUTION: + sc->resolution = val; + fifo_put(sc, PS2MC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 mouse current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + } else { + switch (val) { + case PS2MC_RESET_DEV: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MC_BAT_SUCCESS); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_DEFAULTS: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_DISABLE: + fifo_reset(sc); + sc->status &= ~PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_ENABLE: + fifo_reset(sc); + sc->status |= PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SAMPLING_RATE: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_ID: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_REMOTE_MODE: + sc->status |= PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_DATA: + fifo_put(sc, PS2MC_ACK); + movement_get(sc); + break; + case PS2MC_SET_STREAM_MODE: + sc->status &= ~PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_STATUS: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, sc->status); + fifo_put(sc, sc->resolution); + fifo_put(sc, sc->sampling_rate); + break; + case PS2MC_SET_RESOLUTION: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SCALING1: + case PS2MC_SET_SCALING2: + fifo_put(sc, PS2MC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 mouse command " + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +ps2mouse_event(uint8_t button, int x, int y, void *arg) +{ + struct ps2mouse_softc *sc = arg; + + pthread_mutex_lock(&sc->mtx); + movement_update(sc, x, y); + + sc->status &= ~(PS2M_STS_LEFT_BUTTON | + PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + if (button & (1 << 0)) + sc->status |= PS2M_STS_LEFT_BUTTON; + if (button & (1 << 1)) + sc->status |= PS2M_STS_MID_BUTTON; + if (button & (1 << 2)) + sc->status |= PS2M_STS_RIGHT_BUTTON; + + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) { + /* no data reporting */ + pthread_mutex_unlock(&sc->mtx); + return; + } + + movement_get(sc); + pthread_mutex_unlock(&sc->mtx); + + atkbdc_event(sc->atkbdc_sc); +} + +struct ps2mouse_softc * +ps2mouse_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2mouse_softc *sc; + + sc = calloc(1, sizeof (struct ps2mouse_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + pthread_mutex_lock(&sc->mtx); + ps2mouse_reset(sc); + pthread_mutex_unlock(&sc->mtx); + + console_ptr_register(ps2mouse_event, sc); + + return (sc); +} + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h new file mode 100644 index 0000000000..1a78934b98 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2MOUSE_H_ +#define _PS2MOUSE_H_ + +struct atkbdc_softc; + +struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); + +int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val); + +#endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c new file mode 100644 index 0000000000..0846316378 --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.c @@ -0,0 +1,420 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/socket.h> +#include <netinet/in.h> + +#include <assert.h> +#include <pthread.h> +#include <signal.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "bhyvegc.h" +#include "console.h" +#include "rfb.h" + +struct rfb_softc { + int sfd; + pthread_t tid; + + int width, height; + + bool enc_raw_ok; + bool enc_resize_ok; +}; + +struct rfb_pixfmt { + uint8_t bpp; + uint8_t depth; + uint8_t bigendian; + uint8_t truecolor; + uint16_t red_max; + uint16_t green_max; + uint16_t blue_max; + uint8_t red_shift; + uint8_t green_shift; + uint8_t blue_shift; + uint8_t pad[3]; +}; + +struct rfb_srvr_info { + uint16_t width; + uint16_t height; + struct rfb_pixfmt pixfmt; + uint32_t namelen; +}; + +struct rfb_pixfmt_msg { + uint8_t type; + uint8_t pad[3]; + struct rfb_pixfmt pixfmt; +}; + +#define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_RESIZE -223 + +struct rfb_enc_msg { + uint8_t type; + uint8_t pad; + uint16_t numencs; +}; + +struct rfb_updt_msg { + uint8_t type; + uint8_t incremental; + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; +}; + +struct rfb_key_msg { + uint8_t type; + uint8_t down; + uint16_t pad; + uint32_t code; +}; + +struct rfb_ptr_msg { + uint8_t type; + uint8_t button; + uint16_t x; + uint16_t y; +}; + +struct rfb_srvr_updt_msg { + uint8_t type; + uint8_t pad; + uint16_t numrects; +}; + +struct rfb_srvr_rect_hdr { + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; + uint32_t encoding; +}; + +static void +rfb_send_server_init_msg(int cfd) +{ + struct bhyvegc_image *gc_image; + struct rfb_srvr_info sinfo; + int len; + + gc_image = console_get_image(); + + sinfo.width = ntohs(gc_image->width); + sinfo.height = ntohs(gc_image->height); + sinfo.pixfmt.bpp = 32; + sinfo.pixfmt.depth = 32; + sinfo.pixfmt.bigendian = 0; + sinfo.pixfmt.truecolor = 1; + sinfo.pixfmt.red_max = ntohs(255); + sinfo.pixfmt.green_max = ntohs(255); + sinfo.pixfmt.blue_max = ntohs(255); + sinfo.pixfmt.red_shift = 16; + sinfo.pixfmt.green_shift = 8; + sinfo.pixfmt.blue_shift = 0; + sinfo.namelen = ntohl(strlen("bhyve")); + len = write(cfd, &sinfo, sizeof(sinfo)); + len = write(cfd, "bhyve", strlen("bhyve")); +} + +static void +rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(rc->width); + srect_hdr.height = ntohs(rc->height); + srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); + write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); +} + +static void +rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_pixfmt_msg pixfmt_msg; + int len; + + len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1); +} + + +static void +rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_enc_msg enc_msg; + int len, i; + uint32_t encoding; + + assert((sizeof(enc_msg) - 1) == 3); + len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1); + + for (i = 0; i < ntohs(enc_msg.numencs); i++) { + len = read(cfd, &encoding, sizeof(encoding)); + switch (ntohl(encoding)) { + case RFB_ENCODING_RAW: + rc->enc_raw_ok = true; + break; + case RFB_ENCODING_RESIZE: + rc->enc_resize_ok = true; + break; + } + } +} + +static void +rfb_resize_update(struct rfb_softc *rc, int fd) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(rc->width); + srect_hdr.height = ntohs(rc->height); + srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); + write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr)); +} + +static void +rfb_recv_update_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_updt_msg updt_msg; + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + struct bhyvegc_image *gc_image; + int len; + + len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + + console_refresh(); + gc_image = console_get_image(); + + if (rc->width != gc_image->width || rc->height != gc_image->height) { + rc->width = gc_image->width; + rc->height = gc_image->height; + rfb_send_resize_update_msg(rc, cfd); + } + + /* + * Send the whole thing + */ + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(gc_image->width); + srect_hdr.height = ntohs(gc_image->height); + srect_hdr.encoding = ntohl(0); /* raw */ + write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); + + write(cfd, gc_image->data, gc_image->width * gc_image->height * + sizeof(uint32_t)); +} + +static void +rfb_recv_key_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_key_msg key_msg; + int len; + + len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + + console_key_event(key_msg.down, ntohl(key_msg.code)); +} + +static void +rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_ptr_msg ptr_msg; + int len; + + len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y)); +} + +void +rfb_handle(struct rfb_softc *rc, int cfd) +{ + const char *vbuf = "RFB 003.008\n"; + unsigned char buf[80]; + int len; + uint32_t sres; + + /* 1a. Send server version */ + printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf)); + write(cfd, vbuf, strlen(vbuf)); + + /* 1b. Read client version */ + len = read(cfd, buf, sizeof(buf)); + + /* 2a. Send security type 'none' */ + buf[0] = 1; + buf[1] = 1; /* none */ + write(cfd, buf, 2); + + /* 2b. Read agreed security type */ + len = read(cfd, buf, 1); + + /* 2c. Write back a status of 0 */ + sres = 0; + write(cfd, &sres, 4); + + /* 3a. Read client shared-flag byte */ + len = read(cfd, buf, 1); + + /* 4a. Write server-init info */ + rfb_send_server_init_msg(cfd); + + /* Now read in client requests. 1st byte identifies type */ + for (;;) { + len = read(cfd, buf, 1); + if (len <= 0) { + printf("exiting\n"); + break; + } + + switch (buf[0]) { + case 0: + rfb_recv_set_pixfmt_msg(rc, cfd); + break; + case 2: + rfb_recv_set_encodings_msg(rc, cfd); + break; + case 3: + rfb_recv_update_msg(rc, cfd); + break; + case 4: + rfb_recv_key_msg(rc, cfd); + break; + case 5: + rfb_recv_ptr_msg(rc, cfd); + break; + default: + printf("unknown client code!\n"); + exit(1); + } + } +} + +static void * +rfb_thr(void *arg) +{ + struct rfb_softc *rc; + sigset_t set; + + int cfd; + + rc = arg; + + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { + perror("pthread_sigmask"); + return (NULL); + } + + for (;;) { + cfd = accept(rc->sfd, NULL, NULL); + rfb_handle(rc, cfd); + } + + /* NOTREACHED */ + return (NULL); +} + +int +rfb_init(int port) +{ + struct rfb_softc *rc; + struct sockaddr_in sin; + int on = 1; + + rc = calloc(1, sizeof(struct rfb_softc)); + + rc->sfd = socket(AF_INET, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + return (-1); + } + + setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + return (-1); + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + return (-1); + } + + pthread_create(&rc->tid, NULL, rfb_thr, rc); + + return (0); +} diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h new file mode 100644 index 0000000000..5504c333ab --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RFB_H_ +#define _RFB_H_ + +#define RFB_PORT 5900 + +int rfb_init(int port); + +#endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c new file mode 100644 index 0000000000..5ab78e060f --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.c @@ -0,0 +1,380 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $"); + +#include <sys/types.h> +#include <sys/time.h> + +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <assert.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "acpi.h" +#include "inout.h" +#include "pci_lpc.h" +#include "rtc.h" + +#define IO_RTC 0x70 + +#define RTC_SEC 0x00 /* seconds */ +#define RTC_SEC_ALARM 0x01 +#define RTC_MIN 0x02 +#define RTC_MIN_ALARM 0x03 +#define RTC_HRS 0x04 +#define RTC_HRS_ALARM 0x05 +#define RTC_WDAY 0x06 +#define RTC_DAY 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 /* current century */ + +#define RTC_STATUSA 0xA +#define RTCSA_TUP 0x80 /* time update, don't look now */ + +#define RTC_STATUSB 0xB +#define RTCSB_DST 0x01 +#define RTCSB_24HR 0x02 +#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ +#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ +#define RTCSB_HALT 0x80 /* stop clock updates */ + +#define RTC_INTR 0x0c /* status register C (R) interrupt source */ + +#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ +#define RTCSD_PWR 0x80 /* clock power OK */ + +#define RTC_NVRAM_START 0x0e +#define RTC_NVRAM_END 0x7f +#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) +#define nvoff(x) ((x) - RTC_NVRAM_START) + +#define RTC_DIAG 0x0e +#define RTC_RSTCODE 0x0f +#define RTC_EQUIPMENT 0x14 +#define RTC_LMEM_LSB 0x34 +#define RTC_LMEM_MSB 0x35 +#define RTC_HMEM_LSB 0x5b +#define RTC_HMEM_SB 0x5c +#define RTC_HMEM_MSB 0x5d + +#define m_64KB (64*1024) +#define m_16MB (16*1024*1024) +#define m_4GB (4ULL*1024*1024*1024) + +static int addr; + +static uint8_t rtc_nvram[RTC_NVRAM_SZ]; + +/* XXX initialize these to default values as they would be from BIOS */ +static uint8_t status_a, status_b; + +static struct { + uint8_t hours; + uint8_t mins; + uint8_t secs; +} rtc_alarm; + +static u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; +#define bin2bcd(bin) (bin2bcd_data[bin]) + +#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static int +rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (bytes != 1) + return (-1); + + if (in) { + /* straight read of this register will return 0xFF */ + *eax = 0xff; + return (0); + } + + switch (*eax & 0x7f) { + case RTC_SEC: + case RTC_SEC_ALARM: + case RTC_MIN: + case RTC_MIN_ALARM: + case RTC_HRS: + case RTC_HRS_ALARM: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_STATUSA: + case RTC_STATUSB: + case RTC_INTR: + case RTC_STATUSD: + case RTC_NVRAM_START ... RTC_NVRAM_END: + break; + default: + return (-1); + } + + addr = *eax & 0x7f; + return (0); +} + +static int +rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int hour; + time_t t; + struct timeval cur, delta; + + static struct timeval last; + static struct tm tm; + + if (bytes != 1) + return (-1); + + gettimeofday(&cur, NULL); + + /* + * Increment the cached time only once per second so we can guarantee + * that the guest has at least one second to read the hour:min:sec + * separately and still get a coherent view of the time. + */ + delta = cur; + timevalsub(&delta, &last); + if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { + t = cur.tv_sec; + localtime_r(&t, &tm); + last = cur; + } + + if (in) { + switch (addr) { + case RTC_SEC_ALARM: + *eax = rtc_alarm.secs; + break; + case RTC_MIN_ALARM: + *eax = rtc_alarm.mins; + break; + case RTC_HRS_ALARM: + *eax = rtc_alarm.hours; + break; + case RTC_SEC: + *eax = rtcout(tm.tm_sec); + return (0); + case RTC_MIN: + *eax = rtcout(tm.tm_min); + return (0); + case RTC_HRS: + if (status_b & RTCSB_24HR) + hour = tm.tm_hour; + else + hour = (tm.tm_hour % 12) + 1; + + *eax = rtcout(hour); + + /* + * If we are representing time in the 12-hour format + * then set the MSB to indicate PM. + */ + if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) + *eax |= 0x80; + + return (0); + case RTC_WDAY: + *eax = rtcout(tm.tm_wday + 1); + return (0); + case RTC_DAY: + *eax = rtcout(tm.tm_mday); + return (0); + case RTC_MONTH: + *eax = rtcout(tm.tm_mon + 1); + return (0); + case RTC_YEAR: + *eax = rtcout(tm.tm_year % 100); + return (0); + case RTC_STATUSA: + *eax = status_a; + return (0); + case RTC_STATUSB: + *eax = status_b; + return (0); + case RTC_INTR: + *eax = 0; + return (0); + case RTC_STATUSD: + *eax = RTCSD_PWR; + return (0); + case RTC_NVRAM_START ... RTC_NVRAM_END: + *eax = rtc_nvram[addr - RTC_NVRAM_START]; + return (0); + default: + return (-1); + } + } + + switch (addr) { + case RTC_STATUSA: + status_a = *eax & ~RTCSA_TUP; + break; + case RTC_STATUSB: + /* XXX not implemented yet XXX */ + if (*eax & RTCSB_PINTR) + return (-1); + status_b = *eax; + break; + case RTC_STATUSD: + /* ignore write */ + break; + case RTC_SEC_ALARM: + rtc_alarm.secs = *eax; + break; + case RTC_MIN_ALARM: + rtc_alarm.mins = *eax; + break; + case RTC_HRS_ALARM: + rtc_alarm.hours = *eax; + break; + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + /* + * Ignore writes to the time of day registers + */ + break; + case RTC_NVRAM_START ... RTC_NVRAM_END: + rtc_nvram[addr - RTC_NVRAM_START] = *eax; + break; + default: + return (-1); + } + return (0); +} + +void +rtc_init(struct vmctx *ctx) +{ + struct timeval cur; + struct tm tm; + size_t himem; + size_t lomem; + int err; + + err = gettimeofday(&cur, NULL); + assert(err == 0); + (void) localtime_r(&cur.tv_sec, &tm); + + memset(rtc_nvram, 0, sizeof(rtc_nvram)); + + rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); + + /* XXX init diag/reset code/equipment/checksum ? */ + + /* + * Report guest memory size in nvram cells as required by UEFI. + * Little-endian encoding. + * 0x34/0x35 - 64KB chunks above 16MB, below 4GB + * 0x5b/0x5c/0x5d - 64KB chunks above 4GB + */ + lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; + rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; + rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + + himem = vm_get_highmem_size(ctx) / m_64KB; + rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; + rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; + rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; +} + +INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); +INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + +#ifdef __FreeBSD__ +static void +rtc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (RTC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_RTC, 2); + dsdt_fixed_irq(8); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(rtc_dsdt); +#endif + +SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h new file mode 100644 index 0000000000..6406d24c37 --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $ + */ + +#ifndef _RTC_H_ +#define _RTC_H_ + +void rtc_init(struct vmctx *ctx); + +#endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c new file mode 100644 index 0000000000..7ba0f0dfa0 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -0,0 +1,827 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $"); + +#include <sys/param.h> + +#include <assert.h> +#include <errno.h> +#include <md5.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <uuid.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "smbiostbl.h" + +#define MB (1024*1024) +#define GB (1024ULL*1024*1024) + +#define SMBIOS_BASE 0xF1000 + +/* BHYVE_ACPI_BASE - SMBIOS_BASE) */ +#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) + +#define SMBIOS_TYPE_BIOS 0 +#define SMBIOS_TYPE_SYSTEM 1 +#define SMBIOS_TYPE_CHASSIS 3 +#define SMBIOS_TYPE_PROCESSOR 4 +#define SMBIOS_TYPE_MEMARRAY 16 +#define SMBIOS_TYPE_MEMDEVICE 17 +#define SMBIOS_TYPE_MEMARRAYMAP 19 +#define SMBIOS_TYPE_BOOT 32 +#define SMBIOS_TYPE_EOT 127 + +struct smbios_structure { + uint8_t type; + uint8_t length; + uint16_t handle; +} __packed; + +typedef int (*initializer_func_t)(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_template_entry { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; +}; + +/* + * SMBIOS Structure Table Entry Point + */ +#define SMBIOS_ENTRY_EANCHOR "_SM_" +#define SMBIOS_ENTRY_EANCHORLEN 4 +#define SMBIOS_ENTRY_IANCHOR "_DMI_" +#define SMBIOS_ENTRY_IANCHORLEN 5 + +struct smbios_entry_point { + char eanchor[4]; /* anchor tag */ + uint8_t echecksum; /* checksum of entry point structure */ + uint8_t eplen; /* length in bytes of entry point */ + uint8_t major; /* major version of the SMBIOS spec */ + uint8_t minor; /* minor version of the SMBIOS spec */ + uint16_t maxssize; /* maximum size in bytes of a struct */ + uint8_t revision; /* entry point structure revision */ + uint8_t format[5]; /* entry point rev-specific data */ + char ianchor[5]; /* intermediate anchor tag */ + uint8_t ichecksum; /* intermediate checksum */ + uint16_t stlen; /* len in bytes of structure table */ + uint32_t staddr; /* physical addr of structure table */ + uint16_t stnum; /* number of structure table entries */ + uint8_t bcdrev; /* BCD value representing DMI ver */ +} __packed; + +/* + * BIOS Information + */ +#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ +#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ +#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ +#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ +#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ +#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ + +#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ + +#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ +#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ + +struct smbios_table_type0 { + struct smbios_structure header; + uint8_t vendor; /* vendor string */ + uint8_t version; /* version string */ + uint16_t segment; /* address segment location */ + uint8_t rel_date; /* release date */ + uint8_t size; /* rom size */ + uint64_t cflags; /* characteristics */ + uint8_t xc_bytes[2]; /* characteristics ext bytes */ + uint8_t sb_major_rel; /* system bios version */ + uint8_t sb_minor_rele; + uint8_t ecfw_major_rel; /* embedded ctrl fw version */ + uint8_t ecfw_minor_rel; +} __packed; + +/* + * System Information + */ +#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ + +struct smbios_table_type1 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t product; /* product name string */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t uuid[16]; /* uuid byte array */ + uint8_t wakeup; /* wake-up event */ + uint8_t sku; /* sku number string */ + uint8_t family; /* family name string */ +} __packed; + +/* + * System Enclosure or Chassis + */ +#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_CHST_SAFE 0x03 /* safe */ + +#define SMBIOS_CHSC_NONE 0x03 /* none */ + +struct smbios_table_type3 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t type; /* type */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t bustate; /* boot-up state */ + uint8_t psstate; /* power supply state */ + uint8_t tstate; /* thermal state */ + uint8_t security; /* security status */ + uint8_t uheight; /* height in 'u's */ + uint8_t cords; /* number of power cords */ + uint8_t elems; /* number of element records */ + uint8_t elemlen; /* length of records */ + uint8_t sku; /* sku number string */ +} __packed; + +/* + * Processor Information + */ +#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ + +#define SMBIOS_PRF_OTHER 0x01 /* other */ + +#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ +#define SMBIOS_PRS_ENABLED 0x1 /* enabled */ + +#define SMBIOS_PRU_NONE 0x06 /* none */ + +#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ + +struct smbios_table_type4 { + struct smbios_structure header; + uint8_t socket; /* socket designation string */ + uint8_t type; /* processor type */ + uint8_t family; /* processor family */ + uint8_t manufacturer; /* manufacturer string */ + uint64_t cpuid; /* processor cpuid */ + uint8_t version; /* version string */ + uint8_t voltage; /* voltage */ + uint16_t clkspeed; /* ext clock speed in mhz */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint16_t curspeed; /* current speed in mhz */ + uint8_t status; /* status */ + uint8_t upgrade; /* upgrade */ + uint16_t l1handle; /* l1 cache handle */ + uint16_t l2handle; /* l2 cache handle */ + uint16_t l3handle; /* l3 cache handle */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t cores; /* cores per socket */ + uint8_t ecores; /* enabled cores */ + uint8_t threads; /* threads per socket */ + uint16_t cflags; /* processor characteristics */ + uint16_t family2; /* processor family 2 */ +} __packed; + +/* + * Physical Memory Array + */ +#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ + +#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ + +#define SMBIOS_MAE_NONE 0x03 /* none */ + +struct smbios_table_type16 { + struct smbios_structure header; + uint8_t location; /* physical device location */ + uint8_t use; /* device functional purpose */ + uint8_t ecc; /* err detect/correct method */ + uint32_t size; /* max mem capacity in kb */ + uint16_t errhand; /* handle of error (if any) */ + uint16_t ndevs; /* num of slots or sockets */ + uint64_t xsize; /* max mem capacity in bytes */ +} __packed; + +/* + * Memory Device + */ +#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ + +struct smbios_table_type17 { + struct smbios_structure header; + uint16_t arrayhand; /* handle of physl mem array */ + uint16_t errhand; /* handle of mem error data */ + uint16_t twidth; /* total width in bits */ + uint16_t dwidth; /* data width in bits */ + uint16_t size; /* size in bytes */ + uint8_t form; /* form factor */ + uint8_t set; /* set */ + uint8_t dloc; /* device locator string */ + uint8_t bloc; /* phys bank locator string */ + uint8_t type; /* memory type */ + uint16_t flags; /* memory characteristics */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint8_t manufacturer; /* manufacturer string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t attributes; /* attributes */ + uint32_t xsize; /* extended size in mbs */ + uint16_t curspeed; /* current speed in mhz */ + uint16_t minvoltage; /* minimum voltage */ + uint16_t maxvoltage; /* maximum voltage */ + uint16_t curvoltage; /* configured voltage */ +} __packed; + +/* + * Memory Array Mapped Address + */ +struct smbios_table_type19 { + struct smbios_structure header; + uint32_t saddr; /* start phys addr in kb */ + uint32_t eaddr; /* end phys addr in kb */ + uint16_t arrayhand; /* physical mem array handle */ + uint8_t width; /* num of dev in row */ + uint64_t xsaddr; /* start phys addr in bytes */ + uint64_t xeaddr; /* end phys addr in bytes */ +} __packed; + +/* + * System Boot Information + */ +#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ + +struct smbios_table_type32 { + struct smbios_structure header; + uint8_t reserved[6]; + uint8_t status; /* boot status */ +} __packed; + +/* + * End-of-Table + */ +struct smbios_table_type127 { + struct smbios_structure header; +} __packed; + +struct smbios_table_type0 smbios_type0_template = { + { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, + 1, /* bios vendor string */ + 2, /* bios version string */ + 0xF000, /* bios address segment location */ + 3, /* bios release date */ + 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ + SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | + SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, + { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, + 0x0, /* bios major release */ + 0x0, /* bios minor release */ + 0xff, /* embedded controller firmware major release */ + 0xff /* embedded controller firmware minor release */ +}; + +const char *smbios_type0_strings[] = { + "BHYVE", /* vendor string */ + __TIME__, /* bios version string */ + __DATE__, /* bios release date string */ + NULL +}; + +struct smbios_table_type1 smbios_type1_template = { + { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, + 1, /* manufacturer string */ + 2, /* product string */ + 3, /* version string */ + 4, /* serial number string */ + { 0 }, + SMBIOS_WAKEUP_SWITCH, + 5, /* sku string */ + 6 /* family string */ +}; + +static int smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +const char *smbios_type1_strings[] = { + " ", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + " ", /* family name string */ + NULL +}; + +struct smbios_table_type3 smbios_type3_template = { + { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, + 1, /* manufacturer string */ + SMBIOS_CHT_UNKNOWN, + 2, /* version string */ + 3, /* serial number string */ + 4, /* asset tag string */ + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHSC_NONE, + 0, /* height in 'u's (0=enclosure height unspecified) */ + 0, /* number of power cords (0=number unspecified) */ + 0, /* number of contained element records */ + 0, /* length of records */ + 5 /* sku number string */ +}; + +const char *smbios_type3_strings[] = { + " ", /* manufacturer string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* sku number string */ + NULL +}; + +struct smbios_table_type4 smbios_type4_template = { + { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, + 1, /* socket designation string */ + SMBIOS_PRT_CENTRAL, + SMBIOS_PRF_OTHER, + 2, /* manufacturer string */ + 0, /* cpuid */ + 3, /* version string */ + 0, /* voltage */ + 0, /* external clock frequency in mhz (0=unknown) */ + 0, /* maximum frequency in mhz (0=unknown) */ + 0, /* current frequency in mhz (0=unknown) */ + SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, + SMBIOS_PRU_NONE, + -1, /* l1 cache handle */ + -1, /* l2 cache handle */ + -1, /* l3 cache handle */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* cores per socket (0=unknown) */ + 0, /* enabled cores per socket (0=unknown) */ + 0, /* threads per socket (0=unknown) */ + SMBIOS_PFL_64B, + SMBIOS_PRF_OTHER +}; + +const char *smbios_type4_strings[] = { + " ", /* socket designation string */ + " ", /* manufacturer string */ + " ", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type16 smbios_type16_template = { + { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, + SMBIOS_MAL_SYSMB, + SMBIOS_MAU_SYSTEM, + SMBIOS_MAE_NONE, + 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ + -1, /* handle of error (if any) */ + 0, /* number of slots or sockets (TBD) */ + 0 /* extended maximum memory capacity in bytes (TBD) */ +}; + +static int smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type17 smbios_type17_template = { + { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, + -1, /* handle of physical memory array */ + -1, /* handle of memory error data */ + 64, /* total width in bits including ecc */ + 64, /* data width in bits */ + 0x7fff, /* size in bytes (0x7fff=use extended)*/ + SMBIOS_MDFF_UNKNOWN, + 0, /* set (0x00=none, 0xff=unknown) */ + 1, /* device locator string */ + 2, /* physical bank locator string */ + SMBIOS_MDT_UNKNOWN, + SMBIOS_MDF_UNKNOWN, + 0, /* maximum memory speed in mhz (0=unknown) */ + 3, /* manufacturer string */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* attributes (0=unknown rank information) */ + 0, /* extended size in mb (TBD) */ + 0, /* current speed in mhz (0=unknown) */ + 0, /* minimum voltage in mv (0=unknown) */ + 0, /* maximum voltage in mv (0=unknown) */ + 0 /* configured voltage in mv (0=unknown) */ +}; + +const char *smbios_type17_strings[] = { + " ", /* device locator string */ + " ", /* physical bank locator string */ + " ", /* manufacturer string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type19 smbios_type19_template = { + { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, + 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ + 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ + -1, /* physical memory array handle */ + 1, /* number of devices that form a row */ + 0, /* extended starting phys addr in bytes (TDB) */ + 0 /* extended ending phys addr in bytes (TDB) */ +}; + +static int smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type32 smbios_type32_template = { + { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, + { 0, 0, 0, 0, 0, 0 }, + SMBIOS_BOOT_NORMAL +}; + +struct smbios_table_type127 smbios_type127_template = { + { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } +}; + +static int smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_template_entry smbios_template[] = { + { (struct smbios_structure *)&smbios_type0_template, + smbios_type0_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type1_template, + smbios_type1_strings, + smbios_type1_initializer }, + { (struct smbios_structure *)&smbios_type3_template, + smbios_type3_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type4_template, + smbios_type4_strings, + smbios_type4_initializer }, + { (struct smbios_structure *)&smbios_type16_template, + NULL, + smbios_type16_initializer }, + { (struct smbios_structure *)&smbios_type17_template, + smbios_type17_strings, + smbios_type17_initializer }, + { (struct smbios_structure *)&smbios_type19_template, + NULL, + smbios_type19_initializer }, + { (struct smbios_structure *)&smbios_type32_template, + NULL, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type127_template, + NULL, + smbios_generic_initializer }, + { NULL,NULL, NULL } +}; + +static uint64_t guest_lomem, guest_himem; +static uint16_t type16_handle; + +static int +smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_structure *entry; + + memcpy(curaddr, template_entry, template_entry->length); + entry = (struct smbios_structure *)curaddr; + entry->handle = *n + 1; + curaddr += entry->length; + if (template_strings != NULL) { + int i; + + for (i = 0; template_strings[i] != NULL; i++) { + const char *string; + int len; + + string = template_strings[i]; + len = strlen(string) + 1; + memcpy(curaddr, string, len); + curaddr += len; + } + *curaddr = '\0'; + curaddr++; + } else { + /* Minimum string section is double nul */ + *curaddr = '\0'; + curaddr++; + *curaddr = '\0'; + curaddr++; + } + (*n)++; + *endaddr = curaddr; + + return (0); +} + +static int +smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type1 *type1; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type1 = (struct smbios_table_type1 *)curaddr; + + if (guest_uuid_str != NULL) { + uuid_t uuid; + uint32_t status; + + uuid_from_string(guest_uuid_str, &uuid, &status); + if (status != uuid_s_ok) + return (-1); + + uuid_enc_le(&type1->uuid, &uuid); + } else { + MD5_CTX mdctx; + u_char digest[16]; + char hostname[MAXHOSTNAMELEN]; + + /* + * Universally unique and yet reproducible are an + * oxymoron, however reproducible is desirable in + * this case. + */ + if (gethostname(hostname, sizeof(hostname))) + return (-1); + + MD5Init(&mdctx); + MD5Update(&mdctx, vmname, strlen(vmname)); + MD5Update(&mdctx, hostname, sizeof(hostname)); + MD5Final(digest, &mdctx); + + /* + * Set the variant and version number. + */ + digest[6] &= 0x0F; + digest[6] |= 0x30; /* version 3 */ + digest[8] &= 0x3F; + digest[8] |= 0x80; + + memcpy(&type1->uuid, digest, sizeof (digest)); + } + + return (0); +} + +static int +smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + int i; + + for (i = 0; i < guest_ncpus; i++) { + struct smbios_table_type4 *type4; + char *p; + int nstrings, len; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type4 = (struct smbios_table_type4 *)curaddr; + p = curaddr + sizeof (struct smbios_table_type4); + nstrings = 0; + while (p < *endaddr - 1) { + if (*p++ == '\0') + nstrings++; + } + len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; + *endaddr += len - 1; + *(*endaddr) = '\0'; + (*endaddr)++; + type4->socket = nstrings + 1; + curaddr = *endaddr; + } + + return (0); +} + +static int +smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type16 *type16; + + type16_handle = *n; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type16 = (struct smbios_table_type16 *)curaddr; + type16->xsize = guest_lomem + guest_himem; + type16->ndevs = guest_himem > 0 ? 2 : 1; + + return (0); +} + +static int +smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type17 *type17; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_himem; + } + + return (0); +} + +static int +smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type19 *type19; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 0; + type19->xeaddr = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 4*GB; + type19->xeaddr = guest_himem; + } + + return (0); +} + +static void +smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr) +{ + memset(smbios_ep, 0, sizeof(*smbios_ep)); + memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, + SMBIOS_ENTRY_EANCHORLEN); + smbios_ep->eplen = 0x1F; + assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); + smbios_ep->major = 2; + smbios_ep->minor = 6; + smbios_ep->revision = 0; + memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, + SMBIOS_ENTRY_IANCHORLEN); + smbios_ep->staddr = staddr; + smbios_ep->bcdrev = 0x24; +} + +static void +smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, + uint16_t num, uint16_t maxssize) +{ + uint8_t checksum; + int i; + + smbios_ep->maxssize = maxssize; + smbios_ep->stlen = len; + smbios_ep->stnum = num; + + checksum = 0; + for (i = 0x10; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->ichecksum = checksum; + + checksum = 0; + for (i = 0; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->echecksum = checksum; +} + +int +smbios_build(struct vmctx *ctx) +{ + struct smbios_entry_point *smbios_ep; + uint16_t n; + uint16_t maxssize; + char *curaddr, *startaddr, *ststartaddr; + int i; + int err; + + guest_lomem = vm_get_lowmem_size(ctx); + guest_himem = vm_get_highmem_size(ctx); + + startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "smbios table requires mapped mem\n"); + return (ENOMEM); + } + + curaddr = startaddr; + + smbios_ep = (struct smbios_entry_point *)curaddr; + smbios_ep_initializer(smbios_ep, SMBIOS_BASE + + sizeof(struct smbios_entry_point)); + curaddr += sizeof(struct smbios_entry_point); + ststartaddr = curaddr; + + n = 0; + maxssize = 0; + for (i = 0; smbios_template[i].entry != NULL; i++) { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; + char *endaddr; + uint16_t size; + + entry = smbios_template[i].entry; + strings = smbios_template[i].strings; + initializer = smbios_template[i].initializer; + + err = (*initializer)(entry, strings, curaddr, &endaddr, + &n, &size); + if (err != 0) + return (err); + + if (size > maxssize) + maxssize = size; + + curaddr = endaddr; + } + + assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); + smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize); + + return (0); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h new file mode 100644 index 0000000000..fd7f86be80 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $ + */ + +#ifndef _SMBIOSTBL_H_ +#define _SMBIOSTBL_H_ + +struct vmctx; + +int smbios_build(struct vmctx *ctx); + +#endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c new file mode 100644 index 0000000000..e1dd562d3f --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $"); + +#include <sys/param.h> +#include <sys/types.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "bhyverun.h" +#include "spinup_ap.h" + +static void +spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) +{ + int vector, error; + uint16_t cs; + uint64_t desc_base; + uint32_t desc_limit, desc_access; + + vector = *rip >> PAGE_SHIFT; + *rip = 0; + + /* + * Update the %cs and %rip of the guest so that it starts + * executing real mode code at at 'vector << 12'. + */ + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip); + assert(error == 0); + + error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + &desc_limit, &desc_access); + assert(error == 0); + + desc_base = vector << PAGE_SHIFT; + error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + assert(error == 0); + + cs = (vector << PAGE_SHIFT) >> 4; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + assert(error == 0); +} + +int +spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + /* + * Enable the 'unrestricted guest' mode for 'newcpu'. + * + * Set up the processor state in power-on 16-bit mode, with the CS:IP + * init'd to the specified low-mem 4K page. + */ + error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + spinup_ap_realmode(ctx, newcpu, &rip); + + fbsdrun_addcpu(ctx, vcpu, newcpu, rip); + + return (newcpu); +} diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h new file mode 100644 index 0000000000..090de091ba --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $ + */ + +#ifndef _SPINUP_AP_H_ +#define _SPINUP_AP_H_ + +int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); + +#endif diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c new file mode 100644 index 0000000000..a8b5d40356 --- /dev/null +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -0,0 +1,1042 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $"); + +#include <sys/types.h> +#include <dev/ic/ns16550.h> + +#ifndef __FreeBSD__ +#include <sys/socket.h> +#include <sys/stat.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <termios.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <pthread.h> +#ifndef __FreeBSD__ +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#endif + +#ifndef __FreeBSD__ +#include <bhyve.h> + +#include "bhyverun.h" +#endif +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "uart_emul.h" + +#define COM1_BASE 0x3F8 +#define COM1_IRQ 4 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 + +#define DEFAULT_RCLK 1843200 +#define DEFAULT_BAUD 9600 + +#define FCR_RX_MASK 0xC0 + +#define MCR_OUT1 0x04 +#define MCR_OUT2 0x08 + +#define MSR_DELTA_MASK 0x0f + +#ifndef REG_SCR +#define REG_SCR com_scr +#endif + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +#ifndef __FreeBSD__ +static bool uart_bcons; /* bhyveconsole in use for i/o */ +#endif + +static struct { + int baseaddr; + int irq; + bool inuse; +} uart_lres[] = { + { COM1_BASE, COM1_IRQ, false}, + { COM2_BASE, COM2_IRQ, false}, +}; + +#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct uart_softc { + pthread_mutex_t mtx; /* protects all softc elements */ + uint8_t data; /* Data register (R/W) */ + uint8_t ier; /* Interrupt enable register (R/W) */ + uint8_t lcr; /* Line control register (R/W) */ + uint8_t mcr; /* Modem control register (R/W) */ + uint8_t lsr; /* Line status register (R/W) */ + uint8_t msr; /* Modem status register (R/W) */ + uint8_t fcr; /* FIFO control register (W) */ + uint8_t scr; /* Scratch register (R/W) */ + + uint8_t dll; /* Baudrate divisor latch LSB */ + uint8_t dlh; /* Baudrate divisor latch MSB */ + + struct fifo rxfifo; + + bool opened; + bool stdio; +#ifndef __FreeBSD__ + bool bcons; + struct { + pid_t clipid; + int clifd; /* console client unix domain socket */ + int servfd; /* console server unix domain socket */ + } usc_bcons; +#endif + + bool thre_int_pending; /* THRE interrupt pending */ + + void *arg; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; + +#ifdef __FreeBSD__ +static void uart_drain(int fd, enum ev_type ev, void *arg); +#else +static void uart_tty_drain(struct uart_softc *sc); +static int uart_bcons_drain(struct uart_softc *sc); +#endif + +static struct termios tio_orig, tio_new; /* I/O Terminals */ + +static void +ttyclose(void) +{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ + + tcgetattr(STDIN_FILENO, &tio_orig); + + tio_new = tio_orig; + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + + (void)write(STDIN_FILENO, &wb, 1); +} + +#ifndef __FreeBSD__ +static void +bconswrite(struct uart_softc *sc, unsigned char wb) +{ + (void) write(sc->usc_bcons.clifd, &wb, 1); +} +#endif + +static void +fifo_reset(struct fifo *fifo, int size) +{ + bzero(fifo, sizeof(struct fifo)); + fifo->size = size; +} + +static int +fifo_putchar(struct fifo *fifo, uint8_t ch) +{ + + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = ch; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + return (0); + } else + return (-1); +} + +static int +fifo_getchar(struct fifo *fifo) +{ + int c; + + if (fifo->num > 0) { + c = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (c); + } else + return (-1); +} + +static int +fifo_numchars(struct fifo *fifo) +{ + + return (fifo->num); +} + +static int +fifo_available(struct fifo *fifo) +{ + + return (fifo->num < fifo->size); +} + +static void +uart_opentty(struct uart_softc *sc) +{ + struct mevent *mev; + + assert(!sc->opened && sc->stdio); + + ttyopen(); +#ifdef __FreeBSD__ + mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc); +#endif + assert(mev); +} + +/* + * The IIR returns a prioritized interrupt reason: + * - receive data available + * - transmit holding register empty + * - modem status change + * + * Return an interrupt reason if one is available. + */ +static int +uart_intr_reason(struct uart_softc *sc) +{ + + if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) + return (IIR_RLS); + else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0) + return (IIR_RXTOUT); + else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) + return (IIR_TXRDY); + else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) + return (IIR_MLSC); + else + return (IIR_NOPEND); +} + +static void +uart_reset(struct uart_softc *sc) +{ + uint16_t divisor; + + divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; + sc->dll = divisor; + sc->dlh = divisor >> 16; + + fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */ +} + +/* + * Toggle the COM port's intr pin depending on whether or not we have an + * interrupt condition to report to the processor. + */ +static void +uart_toggle_intr(struct uart_softc *sc) +{ + uint8_t intr_reason; + + intr_reason = uart_intr_reason(sc); + + if (intr_reason == IIR_NOPEND) + (*sc->intr_deassert)(sc->arg); + else + (*sc->intr_assert)(sc->arg); +} + +#ifdef __FreeBSD__ +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int ch; + + sc = arg; + + assert(fd == STDIN_FILENO); + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(); + } else { + while (fifo_available(&sc->rxfifo) && + ((ch = ttyread()) != -1)) { + fifo_putchar(&sc->rxfifo, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} +#else +static void +uart_tty_drain(struct uart_softc *sc) +{ + int ch; + + /* + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(); + } else { + while (fifo_available(&sc->rxfifo) && + ((ch = ttyread()) != -1)) { + fifo_putchar(&sc->rxfifo, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} + +static int +uart_bcons_drain(struct uart_softc *sc) +{ + char ch; + int nbytes; + int ret = 0; + + /* + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_bcons.clifd, &ch, 1); + } else { + for (;;) { + nbytes = read(sc->usc_bcons.clifd, &ch, 1); + if (nbytes == 0) { + ret = 1; + break; + } + if (nbytes == -1 && + errno != EINTR && errno != EAGAIN) { + ret = -1; + break; + } + if (nbytes == -1) { + break; + } + + if (fifo_available(&sc->rxfifo)) { + fifo_putchar(&sc->rxfifo, ch); + } + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); + + return (ret); +} +#endif + +void +uart_write(struct uart_softc *sc, int offset, uint8_t value) +{ + int fifosz; + uint8_t msr; + + pthread_mutex_lock(&sc->mtx); + + /* Open terminal */ + if (!sc->opened && sc->stdio) { + uart_opentty(sc); + sc->opened = true; + } + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + sc->dll = value; + goto done; + } + + if (offset == REG_DLH) { + sc->dlh = value; + goto done; + } + } + + switch (offset) { + case REG_DATA: + if (sc->mcr & MCR_LOOPBACK) { + if (fifo_putchar(&sc->rxfifo, value) != 0) + sc->lsr |= LSR_OE; + } else if (sc->stdio) { + ttywrite(value); +#ifndef __FreeBSD__ + } else if (sc->bcons) { + bconswrite(sc, value); +#endif + } /* else drop on floor */ + sc->thre_int_pending = true; + break; + case REG_IER: + /* + * Apply mask so that bits 4-7 are 0 + * Also enables bits 0-3 only if they're 1 + */ + sc->ier = value & 0x0F; + break; + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + fifo_reset(&sc->rxfifo, fifosz); + } + + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. + */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + fifo_reset(&sc->rxfifo, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + + msr = 0; + if (sc->mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the + * MCR are reflected back into MSR + */ + if (sc->mcr & MCR_RTS) + msr |= MSR_CTS; + if (sc->mcr & MCR_DTR) + msr |= MSR_DSR; + if (sc->mcr & MCR_OUT1) + msr |= MSR_RI; + if (sc->mcr & MCR_OUT2) + msr |= MSR_DCD; + } + + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. + */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint8_t +uart_read(struct uart_softc *sc, int offset) +{ + uint8_t iir, intr_reason, reg; + + pthread_mutex_lock(&sc->mtx); + + /* Open terminal */ + if (!sc->opened && sc->stdio) { + uart_opentty(sc); + sc->opened = true; + } + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + reg = sc->dll; + goto done; + } + + if (offset == REG_DLH) { + reg = sc->dlh; + goto done; + } + } + + switch (offset) { + case REG_DATA: + reg = fifo_getchar(&sc->rxfifo); + break; + case REG_IER: + reg = sc->ier; + break; + case REG_IIR: + iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; + + intr_reason = uart_intr_reason(sc); + + /* + * Deal with side effects of reading the IIR register + */ + if (intr_reason == IIR_TXRDY) + sc->thre_int_pending = false; + + iir |= intr_reason; + + reg = iir; + break; + case REG_LCR: + reg = sc->lcr; + break; + case REG_MCR: + reg = sc->mcr; + break; + case REG_LSR: + /* Transmitter is always ready for more data */ + sc->lsr |= LSR_TEMT | LSR_THRE; + + /* Check for new receive data */ + if (fifo_numchars(&sc->rxfifo) > 0) + sc->lsr |= LSR_RXRDY; + else + sc->lsr &= ~LSR_RXRDY; + + reg = sc->lsr; + + /* The LSR_OE bit is cleared on LSR read */ + sc->lsr &= ~LSR_OE; + break; + case REG_MSR: + /* + * MSR delta bits are cleared on read + */ + reg = sc->msr; + sc->msr &= ~MSR_DELTA_MASK; + break; + case REG_SCR: + reg = sc->scr; + break; + default: + reg = 0xFF; + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +#ifndef __FreeBSD__ +static void * +uart_tty_thread(void *param) +{ + struct uart_softc *sc = param; + pollfd_t pollset; + + pollset.fd = STDIN_FILENO; + pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno != EINTR) { + perror("poll failed"); + break; + } + continue; + } + uart_tty_drain(sc); + } + + return (NULL); +} + +/* + * Read the "ident" string from the client's descriptor; this routine also + * tolerates being called with pid=NULL, for times when you want to "eat" + * the ident string from a client without saving it. + */ +static int +get_client_ident(int clifd, pid_t *pid) +{ + char buf[BUFSIZ], *bufp; + size_t buflen = sizeof (buf); + char c = '\0'; + int i = 0, r; + + /* "eat up the ident string" case, for simplicity */ + if (pid == NULL) { + while (read(clifd, &c, 1) == 1) { + if (c == '\n') + return (0); + } + } + + bzero(buf, sizeof (buf)); + while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) { + buflen--; + if (c == '\n') + break; + + buf[i] = c; + i++; + } + if (r == -1) + return (-1); + + /* + * We've filled the buffer, but still haven't seen \n. Keep eating + * until we find it; we don't expect this to happen, but this is + * defensive. + */ + if (c != '\n') { + while ((r = read(clifd, &c, sizeof (c))) > 0) + if (c == '\n') + break; + } + + /* + * Parse buffer for message of the form: IDENT <pid> + */ + bufp = buf; + if (strncmp(bufp, "IDENT ", 6) != 0) + return (-1); + bufp += 6; + errno = 0; + *pid = strtoll(bufp, &bufp, 10); + if (errno != 0) + return (-1); + + return (0); +} + +static int +uart_bcons_accept_client(struct uart_softc *sc) +{ + int connfd; + struct sockaddr_un cliaddr; + socklen_t clilen; + pid_t pid; + + clilen = sizeof (cliaddr); + connfd = accept(sc->usc_bcons.servfd, + (struct sockaddr *)&cliaddr, &clilen); + if (connfd == -1) + return (-1); + if (get_client_ident(connfd, &pid) == -1) { + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + return (-1); + } + + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + return (-1); + } + (void) write(connfd, "OK\n", 3); + + sc->usc_bcons.clipid = pid; + sc->usc_bcons.clifd = connfd; + + printf("Connection from process ID %lu.\n", pid); + + return (0); +} + +static void +uart_bcons_reject_client(struct uart_softc *sc) +{ + int connfd; + struct sockaddr_un cliaddr; + socklen_t clilen; + char nak[MAXPATHLEN]; + + clilen = sizeof (cliaddr); + connfd = accept(sc->usc_bcons.servfd, + (struct sockaddr *)&cliaddr, &clilen); + + /* + * After hear its ident string, tell client to get lost. + */ + if (get_client_ident(connfd, NULL) == 0) { + (void) snprintf(nak, sizeof (nak), "%lu\n", + sc->usc_bcons.clipid); + (void) write(connfd, nak, strlen(nak)); + } + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); +} + +static int +uart_bcons_client_event(struct uart_softc *sc) +{ + int res; + + res = uart_bcons_drain(sc); + if (res < 0) + return (-1); + + if (res > 0) { + fprintf(stderr, "Closing connection with bhyve console\n"); + (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); + (void) close(sc->usc_bcons.clifd); + sc->usc_bcons.clifd = -1; + } + + return (0); +} + +static void +uart_bcons_server_event(struct uart_softc *sc) +{ + int clifd; + + if (sc->usc_bcons.clifd != -1) { + /* we're already handling a client */ + uart_bcons_reject_client(sc); + return; + } + + if (uart_bcons_accept_client(sc) == 0) { + pthread_mutex_lock(&bcons_wait_lock); + bcons_connected = B_TRUE; + pthread_cond_signal(&bcons_wait_done); + pthread_mutex_unlock(&bcons_wait_lock); + } +} + +static void * +uart_bcons_thread(void *param) +{ + struct uart_softc *sc = param; + struct pollfd pollfds[2]; + int res; + + /* read from client and write to vm */ + pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | + POLLPRI | POLLERR | POLLHUP; + + /* the server socket; watch for events (new connections) */ + pollfds[1].events = pollfds[0].events; + + for (;;) { + pollfds[0].fd = sc->usc_bcons.clifd; + pollfds[1].fd = sc->usc_bcons.servfd; + pollfds[0].revents = pollfds[1].revents = 0; + + res = poll(pollfds, + sizeof (pollfds) / sizeof (struct pollfd), -1); + + if (res == -1 && errno != EINTR) { + perror("poll failed"); + /* we are hosed, close connection */ + break; + } + + /* event from client side */ + if (pollfds[0].revents) { + if (pollfds[0].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (uart_bcons_client_event(sc) < 0) + break; + } else { + break; + } + } + + /* event from server socket */ + if (pollfds[1].revents) { + if (pollfds[1].revents & (POLLIN | POLLRDNORM)) { + uart_bcons_server_event(sc); + } else { + break; + } + } + } + + if (sc->usc_bcons.clifd != -1) { + fprintf(stderr, "Closing connection with bhyve console\n"); + (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); + (void) close(sc->usc_bcons.clifd); + sc->usc_bcons.clifd = -1; + } + + return (NULL); +} + +static int +init_bcons_sock(void) +{ + int servfd; + struct sockaddr_un servaddr; + + if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) { + fprintf(stderr, "bhyve console setup: " + "could not mkdir %s", BHYVE_TMPDIR, strerror(errno)); + return (-1); + } + + bzero(&servaddr, sizeof (servaddr)); + servaddr.sun_family = AF_UNIX; + (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), + BHYVE_CONS_SOCKPATH, vmname); + + if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + fprintf(stderr, "bhyve console setup: " + "could not create socket\n"); + return (-1); + } + (void) unlink(servaddr.sun_path); + + if (bind(servfd, (struct sockaddr *)&servaddr, + sizeof (servaddr)) == -1) { + fprintf(stderr, "bhyve console setup: " + "could not bind to socket\n"); + goto out; + } + + if (listen(servfd, 4) == -1) { + fprintf(stderr, "bhyve console setup: " + "could not listen on socket"); + goto out; + } + return (servfd); + +out: + (void) unlink(servaddr.sun_path); + (void) close(servfd); + return (-1); +} +#endif + +int +uart_legacy_alloc(int which, int *baseaddr, int *irq) +{ + + if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) + return (-1); + + uart_lres[which].inuse = true; + *baseaddr = uart_lres[which].baseaddr; + *irq = uart_lres[which].irq; + + return (0); +} + +struct uart_softc * +uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, + void *arg) +{ + struct uart_softc *sc; + + sc = malloc(sizeof(struct uart_softc)); + bzero(sc, sizeof(struct uart_softc)); + + sc->arg = arg; + sc->intr_assert = intr_assert; + sc->intr_deassert = intr_deassert; + + pthread_mutex_init(&sc->mtx, NULL); + + uart_reset(sc); + + return (sc); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ +#ifndef __FreeBSD__ + int error; +#endif + /* + * XXX one stdio backend supported at this time. + */ + if (opts == NULL) + return (0); + +#ifdef __FreeBSD__ + if (strcmp("stdio", opts) == 0 && !uart_stdio) { + sc->stdio = true; + uart_stdio = true; + return (0); +#else + if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) { + sc->stdio = true; + uart_stdio = true; + + error = pthread_create(NULL, NULL, uart_tty_thread, sc); + assert(error == 0); + + return (0); + } else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) { + sc->bcons = true; + uart_bcons= true; + + if (strstr(opts, "bcons,wait") != 0) { + bcons_wait = true; + } + + sc->usc_bcons.clifd = -1; + if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) { + fprintf(stderr, "bhyve console setup: " + "socket initialization failed\n"); + return (-1); + } + error = pthread_create(NULL, NULL, uart_bcons_thread, sc); + assert(error == 0); + + return (0); +#endif + } else + return (-1); +} diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h new file mode 100644 index 0000000000..ecff957991 --- /dev/null +++ b/usr/src/cmd/bhyve/uart_emul.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $ + */ + +#ifndef _UART_EMUL_H_ +#define _UART_EMUL_H_ + + +#define UART_IO_BAR_SIZE 8 + +struct uart_softc; + +typedef void (*uart_intr_func_t)(void *arg); +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint8_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint8_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); +#endif diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c new file mode 100644 index 0000000000..4330741042 --- /dev/null +++ b/usr/src/cmd/bhyve/vga.c @@ -0,0 +1,1289 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <assert.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <machine/vmm.h> + +#include "bhyvegc.h" +#include "console.h" +#include "inout.h" +#include "mem.h" +#include "vga.h" + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +struct vga_softc { + struct mem_range mr; + + struct bhyvegc *gc; + int gc_width; + int gc_height; + struct bhyvegc_image *gc_image; + + uint8_t *vga_ram; + + /* + * General registers + */ + uint8_t vga_misc; + uint8_t vga_sts1; + + /* + * Sequencer + */ + struct { + int seq_index; + uint8_t seq_reset; + uint8_t seq_clock_mode; + int seq_cm_dots; + uint8_t seq_map_mask; + uint8_t seq_cmap_sel; + int seq_cmap_pri_off; + int seq_cmap_sec_off; + uint8_t seq_mm; + } vga_seq; + + /* + * CRT Controller + */ + struct { + int crtc_index; + uint8_t crtc_mode_ctrl; + uint8_t crtc_horiz_total; + uint8_t crtc_horiz_disp_end; + uint8_t crtc_start_horiz_blank; + uint8_t crtc_end_horiz_blank; + uint8_t crtc_start_horiz_retrace; + uint8_t crtc_end_horiz_retrace; + uint8_t crtc_vert_total; + uint8_t crtc_overflow; + uint8_t crtc_present_row_scan; + uint8_t crtc_max_scan_line; + uint8_t crtc_cursor_start; + uint8_t crtc_cursor_on; + uint8_t crtc_cursor_end; + uint8_t crtc_start_addr_high; + uint8_t crtc_start_addr_low; + uint16_t crtc_start_addr; + uint8_t crtc_cursor_loc_low; + uint8_t crtc_cursor_loc_high; + uint16_t crtc_cursor_loc; + uint8_t crtc_vert_retrace_start; + uint8_t crtc_vert_retrace_end; + uint8_t crtc_vert_disp_end; + uint8_t crtc_offset; + uint8_t crtc_underline_loc; + uint8_t crtc_start_vert_blank; + uint8_t crtc_end_vert_blank; + uint8_t crtc_line_compare; + } vga_crtc; + + /* + * Graphics Controller + */ + struct { + int gc_index; + uint8_t gc_set_reset; + uint8_t gc_enb_set_reset; + uint8_t gc_color_compare; + uint8_t gc_rotate; + uint8_t gc_op; + uint8_t gc_read_map_sel; + uint8_t gc_mode; + bool gc_mode_c4; /* chain 4 */ + bool gc_mode_oe; /* odd/even */ + uint8_t gc_mode_rm; /* read mode */ + uint8_t gc_mode_wm; /* write mode */ + uint8_t gc_misc; + uint8_t gc_misc_gm; /* graphics mode */ + uint8_t gc_misc_mm; /* memory map */ + uint8_t gc_color_dont_care; + uint8_t gc_bit_mask; + uint8_t gc_latch0; + uint8_t gc_latch1; + uint8_t gc_latch2; + uint8_t gc_latch3; + } vga_gc; + + /* + * Attribute Controller + */ + struct { + int atc_flipflop; + int atc_index; + uint8_t atc_palette[16]; + uint8_t atc_mode; + uint8_t atc_overscan_color; + uint8_t atc_color_plane_enb; + uint8_t atc_horiz_pixel_panning; + uint8_t atc_color_select; + uint8_t atc_color_select_45; + uint8_t atc_color_select_67; + } vga_atc; + + /* + * DAC + */ + struct { + uint8_t dac_state; + int dac_rd_index; + int dac_rd_subindex; + int dac_wr_index; + int dac_wr_subindex; + uint8_t dac_palette[3 * 256]; + uint32_t dac_palette_rgb[256]; + } vga_dac; +}; + +static bool +vga_in_reset(struct vga_softc *sc) +{ + return (((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0) || + ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0)); +} + +static void +vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) +{ + int old_width, old_height; + + if (vga_in_reset(sc)) + return; + + old_width = sc->gc_width; + old_height = sc->gc_height; + + /* + * Horizontal Display End: For text modes this is the number + * of characters. For graphics modes this is the number of + * pixels per scanlines divided by the number of pixels per + * character clock. + */ + sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) * + sc->vga_seq.seq_cm_dots; + + sc->gc_height = (sc->vga_crtc.crtc_vert_disp_end | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >> CRTC_OF_VDE8_SHIFT) << 8) | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >> CRTC_OF_VDE9_SHIFT) << 9)) + 1; + + if (old_width != sc->gc_width || old_height != sc->gc_height) + bhyvegc_resize(gc, sc->gc_width, sc->gc_height); +} + +static uint32_t +vga_get_pixel(struct vga_softc *sc, int x, int y) +{ + int offset; + int bit; + uint8_t data; + uint8_t idx; + + offset = (y * sc->gc_width / 8) + (x / 8); + bit = 7 - (x % 8); + + data = (((sc->vga_ram[offset + 0 * 64*KB] >> bit) & 0x1) << 0) | + (((sc->vga_ram[offset + 1 * 64*KB] >> bit) & 0x1) << 1) | + (((sc->vga_ram[offset + 2 * 64*KB] >> bit) & 0x1) << 2) | + (((sc->vga_ram[offset + 3 * 64*KB] >> bit) & 0x1) << 3); + + data &= sc->vga_atc.atc_color_plane_enb; + + if (sc->vga_atc.atc_mode & ATC_MC_IPS) { + idx = sc->vga_atc.atc_palette[data] & 0x0f; + idx |= sc->vga_atc.atc_color_select_45; + } else { + idx = sc->vga_atc.atc_palette[data]; + } + idx |= sc->vga_atc.atc_color_select_67; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_graphics(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_pixel(sc, x, y); + } + } +} + +static uint32_t +vga_get_text_pixel(struct vga_softc *sc, int x, int y) +{ + int dots, offset, bit, font_offset; + uint8_t ch, attr, font; + uint8_t idx; + + dots = sc->vga_seq.seq_cm_dots; + + offset = 2 * sc->vga_crtc.crtc_start_addr; + offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; + + bit = 7 - (x % dots); + + ch = sc->vga_ram[offset + 0 * 64*KB]; + attr = sc->vga_ram[offset + 1 * 64*KB]; + + if (sc->vga_crtc.crtc_cursor_on && + (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) && + ((y % 16) >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) && + ((y % 16) <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) { + idx = sc->vga_atc.atc_palette[attr & 0xf]; + return (sc->vga_dac.dac_palette_rgb[idx]); + } + + if ((sc->vga_seq.seq_mm & SEQ_MM_EM) && + sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) { + if (attr & 0x8) + font_offset = sc->vga_seq.seq_cmap_pri_off + + (ch << 5) + y % 16; + else + font_offset = sc->vga_seq.seq_cmap_sec_off + + (ch << 5) + y % 16; + attr &= ~0x8; + } else { + font_offset = (ch << 5) + y % 16; + } + + font = sc->vga_ram[font_offset + 2 * 64*KB]; + + if ((bit > 0) && (font & (1 << bit))) + idx = sc->vga_atc.atc_palette[attr & 0xf]; + else + idx = sc->vga_atc.atc_palette[attr >> 4]; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_text(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_text_pixel(sc, x, y); + } + } +} + +static void +vga_render(struct bhyvegc *gc, void *arg) +{ + struct vga_softc *sc = arg; + + vga_check_size(gc, sc); + + if (vga_in_reset(sc)) { + memset(sc->gc_image->data, 0, + sc->gc_image->width * sc->gc_image->height * + sizeof (uint32_t)); + return; + } + + if (sc->vga_gc.gc_misc_gm && (sc->vga_atc.atc_mode & ATC_MC_GA)) + vga_render_graphics(sc); + else + vga_render_text(sc); +} + +static uint64_t +vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) +{ + struct vga_softc *sc = arg1; + uint8_t map_sel; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128k + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64k + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ + assert(0); + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + /* Fill latches. */ + sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB]; + sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB]; + sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB]; + sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB]; + + if (sc->vga_gc.gc_mode_rm) { + /* read mode 1 */ + assert(0); + } + + map_sel = sc->vga_gc.gc_read_map_sel; + if (sc->vga_gc.gc_mode_oe) { + map_sel |= (offset & 1); + offset &= ~1; + } + + /* read mode 0: return the byte from the selected plane. */ + offset += map_sel * 64*KB; + + return (sc->vga_ram[offset]); +} + +static void +vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) +{ + struct vga_softc *sc = arg1; + uint8_t c0, c1, c2, c3; + uint8_t m0, m1, m2, m3; + uint8_t set_reset; + uint8_t enb_set_reset; + uint8_t mask; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128kb + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64kb + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ + assert(0); + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + set_reset = sc->vga_gc.gc_set_reset; + enb_set_reset = sc->vga_gc.gc_enb_set_reset; + + c0 = sc->vga_gc.gc_latch0; + c1 = sc->vga_gc.gc_latch1; + c2 = sc->vga_gc.gc_latch2; + c3 = sc->vga_gc.gc_latch3; + + switch (sc->vga_gc.gc_mode_wm) { + case 0: + /* write mode 0 */ + mask = sc->vga_gc.gc_bit_mask; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1) ? mask : 0x00; + m1 = (set_reset & 2) ? mask : 0x00; + m2 = (set_reset & 4) ? mask : 0x00; + m3 = (set_reset & 8) ? mask : 0x00; + + c0 = (enb_set_reset & 1) ? (c0 & ~mask) : (val & mask); + c1 = (enb_set_reset & 2) ? (c1 & ~mask) : (val & mask); + c2 = (enb_set_reset & 4) ? (c2 & ~mask) : (val & mask); + c3 = (enb_set_reset & 8) ? (c3 & ~mask) : (val & mask); + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = set_reset & 1 ? 0xff : ~mask; + m1 = set_reset & 2 ? 0xff : ~mask; + m2 = set_reset & 4 ? 0xff : ~mask; + m3 = set_reset & 8 ? 0xff : ~mask; + + c0 = enb_set_reset & 1 ? c0 & m0 : val & m0; + c1 = enb_set_reset & 2 ? c1 & m1 : val & m1; + c2 = enb_set_reset & 4 ? c2 & m2 : val & m2; + c3 = enb_set_reset & 8 ? c3 & m3 : val & m3; + break; + case 0x10: /* OR */ + m0 = set_reset & 1 ? mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 | m0 : val | m0; + c1 = enb_set_reset & 2 ? c1 | m1 : val | m1; + c2 = enb_set_reset & 4 ? c2 | m2 : val | m2; + c3 = enb_set_reset & 8 ? c3 | m3 : val | m3; + break; + case 0x18: /* XOR */ + m0 = set_reset & 1 ? mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 ^ m0 : val ^ m0; + c1 = enb_set_reset & 2 ? c1 ^ m1 : val ^ m1; + c2 = enb_set_reset & 4 ? c2 ^ m2 : val ^ m2; + c3 = enb_set_reset & 8 ? c3 ^ m3 : val ^ m3; + break; + } + break; + case 1: + /* write mode 1 */ + break; + case 2: + /* write mode 2 */ + mask = sc->vga_gc.gc_bit_mask; + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (val & 1 ? 0xff : 0x00) | ~mask; + m1 = (val & 2 ? 0xff : 0x00) | ~mask; + m2 = (val & 4 ? 0xff : 0x00) | ~mask; + m3 = (val & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + case 3: + /* write mode 3 */ + mask = sc->vga_gc.gc_bit_mask & val; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (set_reset & 1 ? 0xff : 0x00) | ~mask; + m1 = (set_reset & 2 ? 0xff : 0x00) | ~mask; + m2 = (set_reset & 4 ? 0xff : 0x00) | ~mask; + m3 = (set_reset & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + } + + if (sc->vga_gc.gc_mode_oe) { + if (offset & 1) { + offset &= ~1; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + } + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } +} + +static int +vga_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + if (dir == MEM_F_WRITE) { + switch (size) { + case 1: + vga_mem_wr_handler(ctx, addr, *val, arg1); + break; + case 2: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + break; + case 4: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + break; + case 8: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + vga_mem_wr_handler(ctx, addr + 4, *val >> 32, arg1); + vga_mem_wr_handler(ctx, addr + 5, *val >> 40, arg1); + vga_mem_wr_handler(ctx, addr + 6, *val >> 48, arg1); + vga_mem_wr_handler(ctx, addr + 7, *val >> 56, arg1); + break; + } + } else { + switch (size) { + case 1: + *val = vga_mem_rd_handler(ctx, addr, arg1); + break; + case 2: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + break; + case 4: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + break; + case 8: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + *val |= vga_mem_rd_handler(ctx, addr + 4, arg1) << 32; + *val |= vga_mem_rd_handler(ctx, addr + 5, arg1) << 40; + *val |= vga_mem_rd_handler(ctx, addr + 6, arg1) << 48; + *val |= vga_mem_rd_handler(ctx, addr + 7, arg1) << 56; + break; + } + } + + return (0); +} + +static int +vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, + uint8_t *val, void *arg) +{ + struct vga_softc *sc = arg; + + switch (port) { + case CRTC_IDX_MONO_PORT: + case CRTC_IDX_COLOR_PORT: + *val = sc->vga_crtc.crtc_index; + break; + case CRTC_DATA_MONO_PORT: + case CRTC_DATA_COLOR_PORT: + switch (sc->vga_crtc.crtc_index) { + case CRTC_HORIZ_TOTAL: + *val = sc->vga_crtc.crtc_horiz_total; + break; + case CRTC_HORIZ_DISP_END: + *val = sc->vga_crtc.crtc_horiz_disp_end; + break; + case CRTC_START_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_start_horiz_blank; + break; + case CRTC_END_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_end_horiz_blank; + break; + case CRTC_START_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_start_horiz_retrace; + break; + case CRTC_END_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_end_horiz_retrace; + break; + case CRTC_VERT_TOTAL: + *val = sc->vga_crtc.crtc_vert_total; + break; + case CRTC_OVERFLOW: + *val = sc->vga_crtc.crtc_overflow; + break; + case CRTC_PRESET_ROW_SCAN: + *val = sc->vga_crtc.crtc_present_row_scan; + break; + case CRTC_MAX_SCAN_LINE: + *val = sc->vga_crtc.crtc_max_scan_line; + break; + case CRTC_CURSOR_START: + *val = sc->vga_crtc.crtc_cursor_start; + break; + case CRTC_CURSOR_END: + *val = sc->vga_crtc.crtc_cursor_end; + break; + case CRTC_START_ADDR_HIGH: + *val = sc->vga_crtc.crtc_start_addr_high; + break; + case CRTC_START_ADDR_LOW: + *val = sc->vga_crtc.crtc_start_addr_low; + break; + case CRTC_CURSOR_LOC_HIGH: + *val = sc->vga_crtc.crtc_cursor_loc_high; + break; + case CRTC_CURSOR_LOC_LOW: + *val = sc->vga_crtc.crtc_cursor_loc_low; + break; + case CRTC_VERT_RETRACE_START: + *val = sc->vga_crtc.crtc_vert_retrace_start; + break; + case CRTC_VERT_RETRACE_END: + *val = sc->vga_crtc.crtc_vert_retrace_end; + break; + case CRTC_VERT_DISP_END: + *val = sc->vga_crtc.crtc_vert_disp_end; + break; + case CRTC_OFFSET: + *val = sc->vga_crtc.crtc_offset; + break; + case CRTC_UNDERLINE_LOC: + *val = sc->vga_crtc.crtc_underline_loc; + break; + case CRTC_START_VERT_BLANK: + *val = sc->vga_crtc.crtc_start_vert_blank; + break; + case CRTC_END_VERT_BLANK: + *val = sc->vga_crtc.crtc_end_vert_blank; + break; + case CRTC_MODE_CONTROL: + *val = sc->vga_crtc.crtc_mode_ctrl; + break; + case CRTC_LINE_COMPARE: + *val = sc->vga_crtc.crtc_line_compare; + break; + default: + //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + *val = sc->vga_atc.atc_index; + break; + case ATC_DATA_PORT: + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... ATC_PALETTE15: + *val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index]; + break; + case ATC_MODE_CONTROL: + *val = sc->vga_atc.atc_mode; + break; + case ATC_OVERSCAN_COLOR: + *val = sc->vga_atc.atc_overscan_color; + break; + case ATC_COLOR_PLANE_ENABLE: + *val = sc->vga_atc.atc_color_plane_enb; + break; + case ATC_HORIZ_PIXEL_PANNING: + *val = sc->vga_atc.atc_horiz_pixel_panning; + break; + case ATC_COLOR_SELECT: + *val = sc->vga_atc.atc_color_select; + break; + default: + //printf("XXX VGA ATC inb 0x%04x at index %d\n", port , sc->vga_atc.atc_index); + assert(0); + break; + } + break; + case SEQ_IDX_PORT: + *val = sc->vga_seq.seq_index; + break; + case SEQ_DATA_PORT: + switch (sc->vga_seq.seq_index) { + case SEQ_RESET: + *val = sc->vga_seq.seq_reset; + break; + case SEQ_CLOCKING_MODE: + *val = sc->vga_seq.seq_clock_mode; + break; + case SEQ_MAP_MASK: + *val = sc->vga_seq.seq_map_mask; + break; + case SEQ_CHAR_MAP_SELECT: + *val = sc->vga_seq.seq_cmap_sel; + break; + case SEQ_MEMORY_MODE: + *val = sc->vga_seq.seq_mm; + break; + default: + //printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index); + assert(0); + break; + } + case DAC_DATA_PORT: + *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + + sc->vga_dac.dac_rd_subindex]; + sc->vga_dac.dac_rd_subindex++; + if (sc->vga_dac.dac_rd_subindex == 3) { + sc->vga_dac.dac_rd_index++; + sc->vga_dac.dac_rd_subindex = 0; + } + break; + case GC_IDX_PORT: + *val = sc->vga_gc.gc_index; + break; + case GC_DATA_PORT: + switch (sc->vga_gc.gc_index) { + case GC_SET_RESET: + *val = sc->vga_gc.gc_set_reset; + break; + case GC_ENABLE_SET_RESET: + *val = sc->vga_gc.gc_enb_set_reset; + break; + case GC_COLOR_COMPARE: + *val = sc->vga_gc.gc_color_compare; + break; + case GC_DATA_ROTATE: + *val = sc->vga_gc.gc_rotate; + break; + case GC_READ_MAP_SELECT: + *val = sc->vga_gc.gc_read_map_sel; + break; + case GC_MODE: + *val = sc->vga_gc.gc_mode; + break; + case GC_MISCELLANEOUS: + *val = sc->vga_gc.gc_misc; + break; + case GC_COLOR_DONT_CARE: + *val = sc->vga_gc.gc_color_dont_care; + break; + case GC_BIT_MASK: + *val = sc->vga_gc.gc_bit_mask; + break; + default: + //printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case GEN_MISC_OUTPUT_PORT: + *val = sc->vga_misc; + break; + case GEN_INPUT_STS0_PORT: + assert(0); + break; + case GEN_INPUT_STS1_MONO_PORT: + case GEN_INPUT_STS1_COLOR_PORT: + sc->vga_atc.atc_flipflop = 0; + sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); + *val = sc->vga_sts1; + break; + case GEN_FEATURE_CTRL_PORT: + assert(0); + break; + default: + printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); + assert(0); + return (-1); + } + + return (0); +} + +static int +vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, + uint8_t val, void *arg) +{ + struct vga_softc *sc = arg; + + switch (port) { + case CRTC_IDX_MONO_PORT: + case CRTC_IDX_COLOR_PORT: + sc->vga_crtc.crtc_index = val; + break; + case CRTC_DATA_MONO_PORT: + case CRTC_DATA_COLOR_PORT: + switch (sc->vga_crtc.crtc_index) { + case CRTC_HORIZ_TOTAL: + sc->vga_crtc.crtc_horiz_total = val; + break; + case CRTC_HORIZ_DISP_END: + sc->vga_crtc.crtc_horiz_disp_end = val; + break; + case CRTC_START_HORIZ_BLANK: + sc->vga_crtc.crtc_start_horiz_blank = val; + break; + case CRTC_END_HORIZ_BLANK: + sc->vga_crtc.crtc_end_horiz_blank = val; + break; + case CRTC_START_HORIZ_RETRACE: + sc->vga_crtc.crtc_start_horiz_retrace = val; + break; + case CRTC_END_HORIZ_RETRACE: + sc->vga_crtc.crtc_end_horiz_retrace = val; + break; + case CRTC_VERT_TOTAL: + sc->vga_crtc.crtc_vert_total = val; + break; + case CRTC_OVERFLOW: + sc->vga_crtc.crtc_overflow = val; + break; + case CRTC_PRESET_ROW_SCAN: + sc->vga_crtc.crtc_present_row_scan = val; + break; + case CRTC_MAX_SCAN_LINE: + sc->vga_crtc.crtc_max_scan_line = val; + break; + case CRTC_CURSOR_START: + sc->vga_crtc.crtc_cursor_start = val; + sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0; + break; + case CRTC_CURSOR_END: + sc->vga_crtc.crtc_cursor_end = val; + break; + case CRTC_START_ADDR_HIGH: + sc->vga_crtc.crtc_start_addr_high = val; + sc->vga_crtc.crtc_start_addr &= 0x00ff; + sc->vga_crtc.crtc_start_addr |= (val << 8); + break; + case CRTC_START_ADDR_LOW: + sc->vga_crtc.crtc_start_addr_low = val; + sc->vga_crtc.crtc_start_addr &= 0xff00; + sc->vga_crtc.crtc_start_addr |= (val & 0xff); + break; + case CRTC_CURSOR_LOC_HIGH: + sc->vga_crtc.crtc_cursor_loc_high = val; + sc->vga_crtc.crtc_cursor_loc &= 0x00ff; + sc->vga_crtc.crtc_cursor_loc |= (val << 8); + break; + case CRTC_CURSOR_LOC_LOW: + sc->vga_crtc.crtc_cursor_loc_low = val; + sc->vga_crtc.crtc_cursor_loc &= 0xff00; + sc->vga_crtc.crtc_cursor_loc |= (val & 0xff); + break; + case CRTC_VERT_RETRACE_START: + sc->vga_crtc.crtc_vert_retrace_start = val; + break; + case CRTC_VERT_RETRACE_END: + sc->vga_crtc.crtc_vert_retrace_end = val; + break; + case CRTC_VERT_DISP_END: + sc->vga_crtc.crtc_vert_disp_end = val; + break; + case CRTC_OFFSET: + sc->vga_crtc.crtc_offset = val; + break; + case CRTC_UNDERLINE_LOC: + sc->vga_crtc.crtc_underline_loc = val; + break; + case CRTC_START_VERT_BLANK: + sc->vga_crtc.crtc_start_vert_blank = val; + break; + case CRTC_END_VERT_BLANK: + sc->vga_crtc.crtc_end_vert_blank = val; + break; + case CRTC_MODE_CONTROL: + sc->vga_crtc.crtc_mode_ctrl = val; + break; + case CRTC_LINE_COMPARE: + sc->vga_crtc.crtc_line_compare = val; + break; + default: + //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + if (sc->vga_atc.atc_flipflop == 0) { + if (sc->vga_atc.atc_index & 0x20) + assert(0); + sc->vga_atc.atc_index = val & ATC_IDX_MASK; + } else { + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... ATC_PALETTE15: + sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f; + break; + case ATC_MODE_CONTROL: + sc->vga_atc.atc_mode = val; + break; + case ATC_OVERSCAN_COLOR: + sc->vga_atc.atc_overscan_color = val; + break; + case ATC_COLOR_PLANE_ENABLE: + sc->vga_atc.atc_color_plane_enb = val; + break; + case ATC_HORIZ_PIXEL_PANNING: + sc->vga_atc.atc_horiz_pixel_panning = val; + break; + case ATC_COLOR_SELECT: + sc->vga_atc.atc_color_select = val; + sc->vga_atc.atc_color_select_45 = + (val & ATC_CS_C45) << 4; + sc->vga_atc.atc_color_select_67 = + (val & ATC_CS_C67) << 6; + break; + default: + //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); + assert(0); + break; + } + } + sc->vga_atc.atc_flipflop ^= 1; + break; + case ATC_DATA_PORT: + break; + case SEQ_IDX_PORT: + sc->vga_seq.seq_index = val & 0x1f; + break; + case SEQ_DATA_PORT: + switch (sc->vga_seq.seq_index) { + case SEQ_RESET: + sc->vga_seq.seq_reset = val; + break; + case SEQ_CLOCKING_MODE: + sc->vga_seq.seq_clock_mode = val; + sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 8 : 9; + break; + case SEQ_MAP_MASK: + sc->vga_seq.seq_map_mask = val; + break; + case SEQ_CHAR_MAP_SELECT: + sc->vga_seq.seq_cmap_sel = val; + + sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB; + sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB; + break; + case SEQ_MEMORY_MODE: + sc->vga_seq.seq_mm = val; + assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + break; + default: + //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); + assert(0); + break; + } + break; + case DAC_MASK: + break; + case DAC_IDX_RD_PORT: + sc->vga_dac.dac_rd_index = val; + sc->vga_dac.dac_rd_subindex = 0; + break; + case DAC_IDX_WR_PORT: + sc->vga_dac.dac_wr_index = val; + sc->vga_dac.dac_wr_subindex = 0; + break; + case DAC_DATA_PORT: + sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index + + sc->vga_dac.dac_wr_subindex] = val; + sc->vga_dac.dac_wr_subindex++; + if (sc->vga_dac.dac_wr_subindex == 3) { + sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] = + ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0)); + + sc->vga_dac.dac_wr_index++; + sc->vga_dac.dac_wr_subindex = 0; + } + break; + case GC_IDX_PORT: + sc->vga_gc.gc_index = val; + break; + case GC_DATA_PORT: + switch (sc->vga_gc.gc_index) { + case GC_SET_RESET: + sc->vga_gc.gc_set_reset = val; + break; + case GC_ENABLE_SET_RESET: + sc->vga_gc.gc_enb_set_reset = val; + break; + case GC_COLOR_COMPARE: + sc->vga_gc.gc_color_compare = val; + break; + case GC_DATA_ROTATE: + sc->vga_gc.gc_rotate = val; + sc->vga_gc.gc_op = (val >> 3) & 0x3; + break; + case GC_READ_MAP_SELECT: + sc->vga_gc.gc_read_map_sel = val; + break; + case GC_MODE: + sc->vga_gc.gc_mode = val; + sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0; + assert(!sc->vga_gc.gc_mode_c4); + sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; + sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; + sc->vga_gc.gc_mode_wm = val & 0x3; + break; + case GC_MISCELLANEOUS: + sc->vga_gc.gc_misc = val; + sc->vga_gc.gc_misc_gm = val & GC_MISC_GM; + sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >> + GC_MISC_MM_SHIFT; + break; + case GC_COLOR_DONT_CARE: + sc->vga_gc.gc_color_dont_care = val; + break; + case GC_BIT_MASK: + sc->vga_gc.gc_bit_mask = val; + break; + default: + //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index); + assert(0); + break; + } + break; + case GEN_INPUT_STS0_PORT: + /* write to Miscellaneous Output Register */ + sc->vga_misc = val; + break; + case GEN_INPUT_STS1_MONO_PORT: + case GEN_INPUT_STS1_COLOR_PORT: + /* write to Feature Control Register */ + break; + default: + printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port); + //assert(0); + return (-1); + } + return (0); +} + +static int +vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint8_t val; + int error; + + switch (bytes) { + case 1: + if (in) { + *eax &= ~0xff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + } + break; + case 2: + if (in) { + *eax &= ~0xffff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + error = vga_port_in_handler(ctx, in, port + 1, 1, + &val, arg); + if (!error) { + *eax |= (val & 0xff) << 8; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + val = (*eax >> 8) & 0xff; + error =vga_port_out_handler(ctx, in, port + 1, 1, + val, arg); + } + break; + default: + assert(0); + return (-1); + } + + return (error); +} + +int +vga_init(void) +{ + struct inout_port iop; + struct vga_softc *sc; + int port, error; + + sc = calloc(1, sizeof(struct vga_softc)); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "VGA"; + for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) { + iop.port = port; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = vga_port_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + } + + sc->mr.name = "VGA memory"; + sc->mr.flags = MEM_F_RW; + sc->mr.base = 640 * KB; + sc->mr.size = 128 * KB; + sc->mr.handler = vga_mem_handler; + sc->mr.arg1 = sc; + error = register_mem_fallback(&sc->mr); + assert(error == 0); + + sc->vga_ram = malloc(256 * KB); + memset(sc->vga_ram, 0, 256 * KB); + + sc->gc_image = console_get_image(); + console_fb_register(vga_render, sc); + + return (0); +} diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h new file mode 100644 index 0000000000..14637b12b3 --- /dev/null +++ b/usr/src/cmd/bhyve/vga.h @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VGA_H_ +#define _VGA_H_ + +#define VGA_IOPORT_START 0x3c0 +#define VGA_IOPORT_END 0x3df + +/* General registers */ +#define GEN_INPUT_STS0_PORT 0x3c2 +#define GEN_FEATURE_CTRL_PORT 0x3ca +#define GEN_MISC_OUTPUT_PORT 0x3cc +#define GEN_INPUT_STS1_MONO_PORT 0x3ba +#define GEN_INPUT_STS1_COLOR_PORT 0x3da +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ + +/* Attribute controller registers. */ +#define ATC_IDX_PORT 0x3c0 +#define ATC_DATA_PORT 0x3c1 + +#define ATC_IDX_MASK 0x1f +#define ATC_PALETTE0 0 +#define ATC_PALETTE15 15 +#define ATC_MODE_CONTROL 16 +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_OVERSCAN_COLOR 17 +#define ATC_COLOR_PLANE_ENABLE 18 +#define ATC_HORIZ_PIXEL_PANNING 19 +#define ATC_COLOR_SELECT 20 +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ + +/* Sequencer registers. */ +#define SEQ_IDX_PORT 0x3c4 +#define SEQ_DATA_PORT 0x3c5 + +#define SEQ_RESET 0 +#define SEQ_RESET_ASYNC 0x1 +#define SEQ_RESET_SYNC 0x2 +#define SEQ_CLOCKING_MODE 1 +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_MAP_MASK 2 +#define SEQ_CHAR_MAP_SELECT 3 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_MEMORY_MODE 4 +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ + +/* Graphics controller registers. */ +#define GC_IDX_PORT 0x3ce +#define GC_DATA_PORT 0x3cf + +#define GC_SET_RESET 0 +#define GC_ENABLE_SET_RESET 1 +#define GC_COLOR_COMPARE 2 +#define GC_DATA_ROTATE 3 +#define GC_READ_MAP_SELECT 4 +#define GC_MODE 5 +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ + +#define GC_MISCELLANEOUS 6 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 +#define GC_COLOR_DONT_CARE 7 +#define GC_BIT_MASK 8 + +/* CRT controller registers. */ +#define CRTC_IDX_MONO_PORT 0x3b4 +#define CRTC_DATA_MONO_PORT 0x3b5 +#define CRTC_IDX_COLOR_PORT 0x3d4 +#define CRTC_DATA_COLOR_PORT 0x3d5 + +#define CRTC_HORIZ_TOTAL 0 +#define CRTC_HORIZ_DISP_END 1 +#define CRTC_START_HORIZ_BLANK 2 +#define CRTC_END_HORIZ_BLANK 3 +#define CRTC_START_HORIZ_RETRACE 4 +#define CRTC_END_HORIZ_RETRACE 5 +#define CRTC_VERT_TOTAL 6 +#define CRTC_OVERFLOW 7 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_PRESET_ROW_SCAN 8 +#define CRTC_MAX_SCAN_LINE 9 +#define CRTC_MSL_MSL 0x1f +#define CRTC_CURSOR_START 10 +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CURSOR_END 11 +#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_START_ADDR_HIGH 12 +#define CRTC_START_ADDR_LOW 13 +#define CRTC_CURSOR_LOC_HIGH 14 +#define CRTC_CURSOR_LOC_LOW 15 +#define CRTC_VERT_RETRACE_START 16 +#define CRTC_VERT_RETRACE_END 17 +#define CRTC_VRE_MASK 0xf +#define CRTC_VERT_DISP_END 18 +#define CRTC_OFFSET 19 +#define CRTC_UNDERLINE_LOC 20 +#define CRTC_START_VERT_BLANK 21 +#define CRTC_END_VERT_BLANK 22 +#define CRTC_MODE_CONTROL 23 +#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_LINE_COMPARE 24 + +/* DAC registers */ +#define DAC_MASK 0x3c6 +#define DAC_IDX_RD_PORT 0x3c7 +#define DAC_IDX_WR_PORT 0x3c8 +#define DAC_DATA_PORT 0x3c9 + +int vga_init(void); + +#endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c new file mode 100644 index 0000000000..c3b11dc439 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $"); + +#include <sys/param.h> +#include <sys/uio.h> + +#include <stdio.h> +#include <stdint.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by <https://www.google.com/#output=search&q=virtio+spec> + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + if (vs->vs_isr) + pci_lintr_deassert(vs->vs_pi); + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. + */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + VS_LOCK(vs); + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + VS_UNLOCK(vs); + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else + vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ + pci_emul_add_msicap(vs->vs_pi, 1); + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. + * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, head, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_pi->pi_vmctx; + head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + next = head; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_negotiated_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the currently-first request chain to the guest, setting + * its I/O length to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint32_t iolen) +{ + uint16_t head, uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - head is the same value we compute in vq_iovecs(). + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask]; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = head; /* ie, vue->id = head */ + vue->vu_tlen = iolen; + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + new_idx = vq->vq_used->vu_idx; + old_idx = vq->vq_save_used; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). + */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_size; /* size (bytes) */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, + { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, + { VTCFG_R_PFN, 4, 0, "PFN" }, + { VTCFG_R_QNUM, 2, 1, "QNUM" }, + { VTCFG_R_QSEL, 2, 0, "QSEL" }, + { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, + { VTCFG_R_STATUS, 1, 0, "STATUS" }, + { VTCFG_R_ISR, 1, 0, "ISR" }, + { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, + { VTCFG_R_QVEC, 2, 0, "QVEC" }, +}; + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if (cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +/* + * Handle pci config space reads. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = pi->pi_arg; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + uint32_t value; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + * If that fails, fall into general code. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size) { + if (cr != NULL) { + /* offset must be OK, so size must be bad */ + fprintf(stderr, + "%s: read from %s: bad size %d\r\n", + name, cr->cr_name, size); + } else { + fprintf(stderr, + "%s: read from bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + value = vc->vc_hv_caps; + break; + case VTCFG_R_GUESTCAP: + value = vs->vs_negotiated_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq < vc->vc_nvq) + value = vs->vs_queues[vs->vs_curq].vq_pfn; + break; + case VTCFG_R_QNUM: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + break; + case VTCFG_R_QSEL: + value = vs->vs_curq; + break; + case VTCFG_R_QNOTIFY: + value = 0; /* XXX */ + break; + case VTCFG_R_STATUS: + value = vs->vs_status; + break; + case VTCFG_R_ISR: + value = vs->vs_isr; + vs->vs_isr = 0; /* a read clears this flag */ + if (value) + pci_lintr_deassert(pi); + break; + case VTCFG_R_CFGVEC: + value = vs->vs_msix_cfg_idx; + break; + case VTCFG_R_QVEC: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_msix_idx : + VIRTIO_MSI_NO_VECTOR; + break; + } +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +void +vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = pi->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size || cr->cr_ro) { + if (cr != NULL) { + /* offset must be OK, wrong size and/or reg is R/O */ + if (cr->cr_size != size) + fprintf(stderr, + "%s: write to %s: bad size %d\r\n", + name, cr->cr_name, size); + if (cr->cr_ro) + fprintf(stderr, + "%s: write to read-only reg %s\r\n", + name, cr->cr_name); + } else { + fprintf(stderr, + "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vi_vq_init(vs, value); + break; + case VTCFG_R_QSEL: + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. + */ + vs->vs_curq = value; + break; + case VTCFG_R_QNOTIFY: + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + goto done; + } + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + case VTCFG_R_STATUS: + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VTCFG_R_CFGVEC: + vs->vs_msix_cfg_idx = value; + break; + case VTCFG_R_QVEC: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_msix_idx = value; + break; + } + goto done; + +bad_qindex: + fprintf(stderr, + "%s: write config reg %s: curq %d >= max %d\r\n", + name, cr->cr_name, vs->vs_curq, vc->vc_nvq); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h new file mode 100644 index 0000000000..1a2ebe8118 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.h @@ -0,0 +1,475 @@ +/*- + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": <N> descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; <N> uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; <N> elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number <N> that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If <N> is sufficiently large, the above will occupy more than + * two pages. In any case, all pages must be physically contiguous + * within the guest's physical address space. + * + * The <N> 16-byte "desc" descriptors consist of a 64-bit guest + * physical address <addr>, a 32-bit length <len>, a 16-bit + * <flags>, and a 16-bit <next> field (all in guest byte order). + * + * There are three flags that may be set : + * NEXT descriptor is chained, so use its "next" field + * WRITE descriptor is for host to write into guest RAM + * (else host is to read from guest RAM) + * INDIRECT descriptor address field is (guest physical) + * address of a linear array of descriptors + * + * Unless INDIRECT is set, <len> is the number of bytes that may + * be read/written from guest physical address <addr>. If + * INDIRECT is set, WRITE is ignored and <len> provides the length + * of the indirect descriptors (and <len> must be a multiple of + * 16). Note that NEXT may still be set in the main descriptor + * pointing to the indirect, and should be set in each indirect + * descriptor that uses the next descriptor (these should generally + * be numbered sequentially). However, INDIRECT must not be set + * in the indirect descriptors. Upon reaching an indirect descriptor + * without a NEXT bit, control returns to the direct descriptors. + * + * Except inside an indirect, each <next> value must be in the + * range [0 .. N) (i.e., the half-open interval). (Inside an + * indirect, each <next> must be in the range [0 .. <len>/16).) + * + * The "avail" data structures reside in the same pages as the + * "desc" structures since both together are used by the device to + * pass information to the hypervisor's virtual driver. These + * begin with a 16-bit <flags> field and 16-bit index <idx>, then + * have <N> 16-bit <ring> values, followed by one final 16-bit + * field <used_event>. The <N> <ring> entries are simply indices + * indices into the descriptor ring (and thus must meet the same + * constraints as each <next> value). However, <idx> is counted + * up from 0 (initially) and simply wraps around after 65535; it + * is taken mod <N> to find the next available entry. + * + * The "used" ring occupies a separate page or pages, and contains + * values written from the virtual driver back to the guest OS. + * This begins with a 16-bit <flags> and 16-bit <idx>, then there + * are <N> "vring_used" elements, followed by a 16-bit <avail_event>. + * The <N> "vring_used" elements consist of a 32-bit <id> and a + * 32-bit <len> (vu_tlen below). The <id> is simply the index of + * the head of a descriptor chain the guest made available + * earlier, and the <len> is the number of bytes actually written, + * e.g., in the case of a network driver that provided a large + * receive buffer but received only a small amount of data. + * + * The two event fields, <used_event> and <avail_event>, in the + * avail and used rings (respectively -- note the reversal!), are + * always provided, but are used only if the virtual device + * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature + * negotiation. Similarly, both rings provide a flag -- + * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in + * their <flags> field, indicating that the guest does not need an + * interrupt, or that the hypervisor driver does not need a + * notify, when descriptors are added to the corresponding ring. + * (These are provided only for interrupt optimization and need + * not be implemented.) + */ +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +struct virtio_desc { /* AKA vring_desc */ + uint64_t vd_addr; /* guest physical address */ + uint32_t vd_len; /* length of scatter/gather seg */ + uint16_t vd_flags; /* VRING_F_DESC_* */ + uint16_t vd_next; /* next desc if F_NEXT */ +} __packed; + +struct virtio_used { /* AKA vring_used_elem */ + uint32_t vu_idx; /* head of used descriptor chain */ + uint32_t vu_tlen; /* length written-to */ +} __packed; + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct vring_avail { + uint16_t va_flags; /* VRING_AVAIL_F_* */ + uint16_t va_idx; /* counts to 65535, then cycles */ + uint16_t va_ring[]; /* size N, reported in QNUM value */ +/* uint16_t va_used_event; -- after N ring entries */ +} __packed; + +#define VRING_USED_F_NO_NOTIFY 1 +struct vring_used { + uint16_t vu_flags; /* VRING_USED_F_* */ + uint16_t vu_idx; /* counts to 65535, then cycles */ + struct virtio_used vu_ring[]; /* size N */ +/* uint16_t vu_avail_event; -- after N ring entries */ +} __packed; + +/* + * The address of any given virtual queue is determined by a single + * Page Frame Number register. The guest writes the PFN into the + * PCI config space. However, a device that has two or more + * virtqueues can have a different PFN, and size, for each queue. + * The number of queues is determinable via the PCI config space + * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means + * queue #0, 1 means queue#1, etc. Once a queue is selected, the + * remaining PFN and QNUM registers refer to that queue. + * + * QNUM is a read-only register containing a nonzero power of two + * that indicates the (hypervisor's) queue size. Or, if reading it + * produces zero, the hypervisor does not have a corresponding + * queue. (The number of possible queues depends on the virtual + * device. The block device has just one; the network device + * provides either two -- 0 = receive, 1 = transmit -- or three, + * with 2 = control.) + * + * PFN is a read/write register giving the physical page address of + * the virtqueue in guest memory (the guest must allocate enough space + * based on the hypervisor's provided QNUM). + * + * QNOTIFY is effectively write-only: when the guest writes a queue + * number to the register, the hypervisor should scan the specified + * virtqueue. (Reading QNOTIFY currently always gets 0). + */ + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + * + * XXX Should really be merged with <dev/virtio/virtio.h> defines + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_RANDOM 0x1002 + +/* + * PCI config space constants. + * + * If MSI-X is enabled, the ISR register is generally not used, + * and the configuration vector and queue vector appear at offsets + * 20 and 22 with the remaining configuration registers at 24. + * If MSI-X is not enabled, those two registers disappear and + * the remaining configuration registers start at offset 20. + */ +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 + +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. + */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification */ +static inline size_t +vring_size(u_int qsz) +{ + size_t size; + + /* constant 3 below = va_flags, va_idx, va_used_event */ + size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); + size = roundup2(size, VRING_ALIGN); + + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ + size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; + size = roundup2(size, VRING_ALIGN); + + return (size); +} + +struct vmctx; +struct pci_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? */ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct pci_devinst *vs_pi; /* PCI device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + uint8_t vs_status; /* value from last status write */ + uint8_t vs_isr; /* ISR flags, if not MSI-X */ + uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? */ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ + +}; +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Called by virtio driver as it starts processing chains. Each + * completed chain (obtained from vq_getchain()) is released by + * calling vq_relchain(), then when all are done, vq_endchains() + * can tell if / how-many chains were processed and know whether + * and how to generate an interrupt. + */ +static inline void +vq_startchains(struct vqueue_info *vq) +{ + + vq->vq_save_used = vq->vq_used->vu_idx; +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + + if (pci_msix_enabled(vs->vs_pi)) + pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); + else { + VS_LOCK(vs); + vs->vs_isr |= VTCFG_ISR_QUEUES; + pci_generate_msi(vs->vs_pi, 0); + pci_lintr_assert(vs->vs_pi); + VS_UNLOCK(vs); + } +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_bar(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size); +void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value); +#endif /* _VIRTIO_H_ */ diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c new file mode 100644 index 0000000000..0c097251e0 --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $"); + +#include <sys/types.h> + +#include <machine/cpufunc.h> +#include <machine/vmm.h> +#include <machine/specialreg.h> + +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xmsr.h" + +static int cpu_vendor_intel, cpu_vendor_amd; + +int +emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) +{ + + if (cpu_vendor_intel) { + switch (num) { +#ifndef __FreeBSD__ + case MSR_PERFCTR0: + case MSR_PERFCTR1: + case MSR_EVNTSEL0: + case MSR_EVNTSEL1: + return (0); +#endif + case 0xd04: /* Sandy Bridge uncore PMCs */ + case 0xc24: + return (0); + case MSR_BIOS_UPDT_TRIG: + return (0); + case MSR_BIOS_SIGN: + return (0); + default: + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_HWCR: + /* + * Ignore writes to hardware configuration MSR. + */ + return (0); + + case MSR_NB_CFG1: + case MSR_IC_CFG: + return (0); /* Ignore writes */ + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* Ignore writes to the PerfEvtSel MSRs */ + return (0); + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* Ignore writes to the PerfCtr MSRs */ + return (0); + + case MSR_P_STATE_CONTROL: + /* Ignore write to change the P-state */ + return (0); + + default: + break; + } + } + return (-1); +} + +int +emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) +{ + int error = 0; + + if (cpu_vendor_intel) { + switch (num) { + case MSR_BIOS_SIGN: + case MSR_IA32_PLATFORM_ID: + case MSR_PKG_ENERGY_STATUS: + case MSR_PP0_ENERGY_STATUS: + case MSR_PP1_ENERGY_STATUS: + case MSR_DRAM_ENERGY_STATUS: + *val = 0; + break; + case MSR_RAPL_POWER_UNIT: + /* + * Use the default value documented in section + * "RAPL Interfaces" in Intel SDM vol3. + */ + *val = 0x000a1003; + break; + default: + error = -1; + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_BIOS_SIGN: + *val = 0; + break; + case MSR_HWCR: + /* + * Bios and Kernel Developer's Guides for AMD Families + * 12H, 14H, 15H and 16H. + */ + *val = 0x01000010; /* Reset value */ + *val |= 1 << 9; /* MONITOR/MWAIT disable */ + break; + + case MSR_NB_CFG1: + case MSR_IC_CFG: + /* + * The reset value is processor family dependent so + * just return 0. + */ + *val = 0; + break; + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* + * PerfEvtSel MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* + * PerfCtr MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_SMM_ADDR: + case MSR_SMM_MASK: + /* + * Return the reset value defined in the AMD Bios and + * Kernel Developer's Guide. + */ + *val = 0; + break; + + case MSR_P_STATE_LIMIT: + case MSR_P_STATE_CONTROL: + case MSR_P_STATE_STATUS: + case MSR_P_STATE_CONFIG(0): /* P0 configuration */ + *val = 0; + break; + + /* + * OpenBSD guests test bit 0 of this MSR to detect if the + * workaround for erratum 721 is already applied. + * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + */ + case 0xC0011029: + *val = 1; + break; + + default: + error = -1; + break; + } + } else { + error = -1; + } + return (error); +} + +int +init_msr(void) +{ + int error; + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + error = 0; + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + cpu_vendor_amd = 1; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + cpu_vendor_intel = 1; + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + error = -1; + } + return (error); +} diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h new file mode 100644 index 0000000000..ac3c147442 --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int init_msr(void); +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val); + +#endif diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile new file mode 100644 index 0000000000..11d34e6599 --- /dev/null +++ b/usr/src/cmd/bhyveconsole/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.cmd + +SUBDIRS= $(MACH) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all: $(SUBDIRS) + +clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c new file mode 100644 index 0000000000..7f237a72f6 --- /dev/null +++ b/usr/src/cmd/bhyveconsole/bhyveconsole.c @@ -0,0 +1,360 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/param.h> +#include <sys/signal.h> +#include <sys/socket.h> +#include <sys/termios.h> +#include <assert.h> +#include <errno.h> +#include <libgen.h> +#include <stdarg.h> +#include <stdio.h> +#include <strings.h> +#include <unistd.h> + +#include <bhyve.h> + +static int masterfd; +static struct termios save_termios; +static int save_fd; + +static int nocmdchar = 0; +static char cmdchar = '~'; + +static const char *pname; + +#define BCONS_BUFSIZ 8192 + +static void +usage(void) +{ + (void) fprintf(stderr, "usage: %s vmname\n", pname); + exit(2); +} + +static void +bcons_error(const char *fmt, ...) +{ + va_list alist; + + (void) fprintf(stderr, "%s: ", pname); + va_start(alist, fmt); + (void) vfprintf(stderr, fmt, alist); + va_end(alist); + (void) fprintf(stderr, "\n"); +} + +static void +bcons_perror(const char *str) +{ + const char *estr; + + if ((estr = strerror(errno)) != NULL) + (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr); + else + (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno); +} + +/* + * Create the unix domain socket and call bhyve; handshake + * with it to determine whether it will allow us to connect. + */ +static int +get_console(const char *vmname) +{ + int sockfd = -1; + struct sockaddr_un servaddr; + char clientid[MAXPATHLEN]; + char handshake[MAXPATHLEN], c; + int msglen; + int i = 0, err = 0; + + if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + bcons_perror("could not create socket"); + return (-1); + } + + bzero(&servaddr, sizeof (servaddr)); + servaddr.sun_family = AF_UNIX; + (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), + BHYVE_CONS_SOCKPATH, vmname); + + if (connect(sockfd, (struct sockaddr *)&servaddr, + sizeof (servaddr)) == -1) { + bcons_perror("Could not connect to console server"); + goto bad; + } + masterfd = sockfd; + + msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n", + getpid()); + assert(msglen > 0 && msglen < sizeof (clientid)); + + if (write(masterfd, clientid, msglen) != msglen) { + bcons_error("protocol error"); + goto bad; + } + + /* + * Take care not to accumulate more than our fill, and leave room for + * the NUL at the end. + */ + while ((err = read(masterfd, &c, 1)) == 1) { + if (i >= (sizeof (handshake) - 1)) + break; + if (c == '\n') + break; + handshake[i] = c; + i++; + } + handshake[i] = '\0'; + + /* + * If something went wrong during the handshake we bail; perhaps + * the server died off. + */ + if (err == -1) { + bcons_perror("Could not connect to console server"); + goto bad; + } + + if (strncmp(handshake, "OK", sizeof (handshake)) == 0) + return (0); + + bcons_error("Console is already in use by process ID %s.", + handshake); +bad: + (void) close(sockfd); + masterfd = -1; + return (-1); +} + +/* + * Place terminal into raw mode. + */ +static int +set_tty_rawmode(int fd) +{ + struct termios term; + if (tcgetattr(fd, &term) < 0) { + bcons_perror("failed to get user terminal settings"); + return (-1); + } + + /* Stash for later, so we can revert back to previous mode */ + save_termios = term; + save_fd = fd; + + /* disable 8->7 bit strip, start/stop, enable any char to restart */ + term.c_iflag &= ~(ISTRIP|IXON|IXANY); + /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */ + term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC); + /* disable output post-processing */ + term.c_oflag &= ~OPOST; + /* disable canonical mode, signal chars, echo & extended functions */ + term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN); + + term.c_cc[VMIN] = 1; /* byte-at-a-time */ + term.c_cc[VTIME] = 0; + + if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) { + bcons_perror("failed to set user terminal to raw mode"); + return (-1); + } + + return (0); +} + +/* + * reset terminal settings for global environment + */ +static void +reset_tty(void) +{ + (void) tcsetattr(save_fd, TCSADRAIN, &save_termios); +} + +/* + * process_user_input watches the input stream for the escape sequence for + * 'quit' (by default, tilde-period). Because we might be fed just one + * keystroke at a time, state associated with the user input (are we at the + * beginning of the line? are we locally echoing the next character?) is + * maintained by beginning_of_line and local_echo across calls to the routine. + * + * This routine returns -1 when the 'quit' escape sequence has been issued, + * or an error is encountered and 0 otherwise. + */ +static int +process_user_input(int out_fd, int in_fd) +{ + static boolean_t beginning_of_line = B_TRUE; + static boolean_t local_echo = B_FALSE; + char ibuf[BCONS_BUFSIZ]; + int nbytes; + char *buf = ibuf; + char c; + + nbytes = read(in_fd, ibuf, sizeof (ibuf)); + if (nbytes == -1 && errno != EINTR) + return (-1); + + if (nbytes == -1) /* The read was interrupted. */ + return (0); + + for (c = *buf; nbytes > 0; c = *buf, --nbytes) { + buf++; + if (beginning_of_line && !nocmdchar) { + beginning_of_line = B_FALSE; + if (c == cmdchar) { + local_echo = B_TRUE; + continue; + } + } else if (local_echo) { + local_echo = B_FALSE; + if (c == '.') { + (void) write(STDOUT_FILENO, &cmdchar, 1); + (void) write(STDOUT_FILENO, &c, 1); + return (-1); + } + } + + (void) write(out_fd, &c, 1); + + beginning_of_line = (c == '\r' || c == '\n'); + } + + return (0); +} + +static int +process_output(int in_fd, int out_fd) +{ + int wrote = 0; + int cc; + char ibuf[BCONS_BUFSIZ]; + + cc = read(in_fd, ibuf, sizeof (ibuf)); + if (cc == -1 && errno != EINTR) + return (-1); + if (cc == 0) /* EOF */ + return (-1); + if (cc == -1) /* The read was interrupted. */ + return (0); + + do { + int len; + + len = write(out_fd, ibuf + wrote, cc - wrote); + if (len == -1 && errno != EINTR) + return (-1); + if (len != -1) + wrote += len; + } while (wrote < cc); + + return (0); +} + +/* + * This is the main I/O loop. + */ +static void +doio(void) +{ + struct pollfd pollfds[2]; + int res; + + /* read from vm and write to stdout */ + pollfds[0].fd = masterfd; + pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI; + + /* read from stdin and write to vm */ + pollfds[1].fd = STDIN_FILENO; + pollfds[1].events = pollfds[0].events; + + for (;;) { + pollfds[0].revents = pollfds[1].revents = 0; + + res = poll(pollfds, + sizeof (pollfds) / sizeof (struct pollfd), -1); + + if (res == -1 && errno != EINTR) { + bcons_perror("poll failed"); + /* we are hosed, close connection */ + break; + } + + /* event from master side stdout */ + if (pollfds[0].revents) { + if (pollfds[0].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (process_output(masterfd, STDOUT_FILENO) + != 0) + break; + } else { + break; + } + } + + /* event from user stdin side */ + if (pollfds[1].revents) { + if (pollfds[1].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (process_user_input(masterfd, STDIN_FILENO) + != 0) + break; + } else { + break; + } + } + } +} + +int +main(int argc, char **argv) +{ + char *vmname; + + pname = basename(argv[0]); + + if (argc == 2) { + vmname = argv[1]; + } else { + usage(); + } + + /* + * Make contact with bhyve + */ + if (get_console(vmname) == -1) + return (1); + + (void) printf("[Connected to vm '%s' console]\n", vmname); + + if (set_tty_rawmode(STDIN_FILENO) == -1) { + reset_tty(); + bcons_perror("failed to set stdin pty to raw mode"); + return (1); + } + + /* + * Run the I/O loop until we get disconnected. + */ + doio(); + reset_tty(); + (void) printf("\n[Connection to vm '%s' console closed]\n", vmname); + + return (0); +} diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile new file mode 100644 index 0000000000..c4f317a9fa --- /dev/null +++ b/usr/src/cmd/bhyveconsole/i386/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyveconsole + +OBJS= bhyveconsole.o + +SRCS= $(OBJS:%.o=../%.c) + +include ../../Makefile.cmd + +CFLAGS += $(CCVERBOSE) +LDLIBS += -lsocket + +.KEEP_STATE: + +%.o: ../%.c + $(COMPILE.c) $< + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) $(OBJS) -o $@ $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG32) + +clean: + $(RM) $(OBJS) + +include ../../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile new file mode 100644 index 0000000000..fe98204056 --- /dev/null +++ b/usr/src/cmd/bhyvectl/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG = bhyvectl + +include ../Makefile.cmd + +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com new file mode 100644 index 0000000000..03ca34792c --- /dev/null +++ b/usr/src/cmd/bhyvectl/Makefile.com @@ -0,0 +1,48 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyvectl + +SRCS = bhyvectl.c +OBJS = $(SRCS:.c=.o) + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm +LDLIBS += -lvmmapi + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) -I$(SRC)/common $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyvectl/amd64/Makefile new file mode 100644 index 0000000000..b602c50d05 --- /dev/null +++ b/usr/src/cmd/bhyvectl/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c new file mode 100644 index 0000000000..07d0a83df5 --- /dev/null +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -0,0 +1,1523 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/errno.h> +#include <sys/mman.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <libgen.h> +#include <libutil.h> +#include <fcntl.h> +#include <string.h> +#include <getopt.h> +#include <assert.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "intel/vmcs.h" + +#define MB (1UL << 20) +#define GB (1UL << 30) + +#define REQ_ARG required_argument +#define NO_ARG no_argument +#define OPT_ARG optional_argument + +static const char *progname; + +static void +usage(void) +{ + + (void)fprintf(stderr, + "Usage: %s --vm=<vmname>\n" + " [--cpu=<vcpu_number>]\n" + " [--create]\n" + " [--destroy]\n" + " [--get-all]\n" + " [--get-stats]\n" + " [--set-desc-ds]\n" + " [--get-desc-ds]\n" + " [--set-desc-es]\n" + " [--get-desc-es]\n" + " [--set-desc-gs]\n" + " [--get-desc-gs]\n" + " [--set-desc-fs]\n" + " [--get-desc-fs]\n" + " [--set-desc-cs]\n" + " [--get-desc-cs]\n" + " [--set-desc-ss]\n" + " [--get-desc-ss]\n" + " [--set-desc-tr]\n" + " [--get-desc-tr]\n" + " [--set-desc-ldtr]\n" + " [--get-desc-ldtr]\n" + " [--set-desc-gdtr]\n" + " [--get-desc-gdtr]\n" + " [--set-desc-idtr]\n" + " [--get-desc-idtr]\n" + " [--run]\n" + " [--capname=<capname>]\n" + " [--getcap]\n" + " [--setcap=<0|1>]\n" + " [--desc-base=<BASE>]\n" + " [--desc-limit=<LIMIT>]\n" + " [--desc-access=<ACCESS>]\n" + " [--set-cr0=<CR0>]\n" + " [--get-cr0]\n" + " [--set-cr3=<CR3>]\n" + " [--get-cr3]\n" + " [--set-cr4=<CR4>]\n" + " [--get-cr4]\n" + " [--set-dr7=<DR7>]\n" + " [--get-dr7]\n" + " [--set-rsp=<RSP>]\n" + " [--get-rsp]\n" + " [--set-rip=<RIP>]\n" + " [--get-rip]\n" + " [--get-rax]\n" + " [--set-rax=<RAX>]\n" + " [--get-rbx]\n" + " [--get-rcx]\n" + " [--get-rdx]\n" + " [--get-rsi]\n" + " [--get-rdi]\n" + " [--get-rbp]\n" + " [--get-r8]\n" + " [--get-r9]\n" + " [--get-r10]\n" + " [--get-r11]\n" + " [--get-r12]\n" + " [--get-r13]\n" + " [--get-r14]\n" + " [--get-r15]\n" + " [--set-rflags=<RFLAGS>]\n" + " [--get-rflags]\n" + " [--set-cs]\n" + " [--get-cs]\n" + " [--set-ds]\n" + " [--get-ds]\n" + " [--set-es]\n" + " [--get-es]\n" + " [--set-fs]\n" + " [--get-fs]\n" + " [--set-gs]\n" + " [--get-gs]\n" + " [--set-ss]\n" + " [--get-ss]\n" + " [--get-tr]\n" + " [--get-ldtr]\n" + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=<info>]\n" + " [--get-vmcs-eptp]\n" + " [--get-vmcs-guest-physical-address\n" + " [--get-vmcs-guest-linear-address\n" + " [--set-vmcs-exception-bitmap]\n" + " [--get-vmcs-exception-bitmap]\n" + " [--get-vmcs-io-bitmap-address]\n" + " [--get-vmcs-tsc-offset]\n" + " [--get-vmcs-guest-pat]\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + " [--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-msr-bitmap]\n" + " [--get-vmcs-msr-bitmap-address]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-ple-gap]\n" + " [--get-vmcs-ple-window]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-guest-sysenter]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-reason]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + " [--set-x2apic-state=<state>]\n" + " [--get-x2apic-state]\n" + " [--unassign-pptdev=<bus/slot/func>]\n" + " [--set-mem=<memory in units of MB>]\n" + " [--get-lowmem]\n" + " [--get-highmem]\n", + progname); + exit(1); +} + +static int get_stats, getcap, setcap, capval; +static const char *capname; +static int create, destroy, get_lowmem, get_highmem; +static uint64_t memsize; +static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_efer, get_efer; +static int set_dr7, get_dr7; +static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; +static int set_rax, get_rax; +static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; +static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; +static int set_desc_ds, get_desc_ds; +static int set_desc_es, get_desc_es; +static int set_desc_fs, get_desc_fs; +static int set_desc_gs, get_desc_gs; +static int set_desc_cs, get_desc_cs; +static int set_desc_ss, get_desc_ss; +static int set_desc_gdtr, get_desc_gdtr; +static int set_desc_idtr, get_desc_idtr; +static int set_desc_tr, get_desc_tr; +static int set_desc_ldtr, get_desc_ldtr; +static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; +static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; +static int set_x2apic_state, get_x2apic_state; +enum x2apic_state x2apic_state; +static int unassign_pptdev, bus, slot, func; +static int run; + +/* + * VMCS-specific fields + */ +static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; +static int get_eptp, get_io_bitmap, get_tsc_offset; +static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; +static int get_vmcs_interruptibility; +uint32_t vmcs_entry_interruption_info; +static int get_vmcs_gpa, get_vmcs_gla; +static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; +static int get_cr0_mask, get_cr0_shadow; +static int get_cr4_mask, get_cr4_shadow; +static int get_cr3_targets; +static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; +static int get_msr_bitmap, get_msr_bitmap_address; +static int get_vpid, get_ple_gap, get_ple_window; +static int get_inst_err, get_exit_ctls, get_entry_ctls; +static int get_host_cr0, get_host_cr3, get_host_cr4; +static int get_host_rip, get_host_rsp; +static int get_guest_pat, get_host_pat; +static int get_guest_sysenter, get_vmcs_link; +static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; + +static uint64_t desc_base; +static uint32_t desc_limit, desc_access; + +static int get_all; + +static void +dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) +{ + printf("vm exit[%d]\n", vcpu); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + switch (vmexit->exitcode) { + case VM_EXITCODE_INOUT: + printf("\treason\t\tINOUT\n"); + printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); + printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); + printf("\tflags\t\t%s%s\n", + vmexit->u.inout.string ? "STRING " : "", + vmexit->u.inout.rep ? "REP " : ""); + printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); + printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); + break; + case VM_EXITCODE_VMX: + printf("\treason\t\tVMX\n"); + printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); + printf("\texit_reason\t0x%08x (%u)\n", + vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); + break; + default: + printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); + break; + } +} + +static int +dump_vmcs_msr_bitmap(int vcpu, u_long addr) +{ + int error, fd, byte, bit, readable, writeable; + u_int msr; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) + goto done; + + bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); + if (bitmap == MAP_FAILED) + goto done; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 8; + bit = msr & 0x7; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 1024; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", + 0xc0000000 + msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + } + + error = 0; +done: + if (bitmap != MAP_FAILED) + munmap((void *)bitmap, PAGE_SIZE); + if (fd >= 0) + close(fd); + return (error); +} + +static int +vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); +} + +static int +vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); +} + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ + VCPU, + SET_MEM, + SET_EFER, + SET_CR0, + SET_CR3, + SET_CR4, + SET_DR7, + SET_RSP, + SET_RIP, + SET_RAX, + SET_RFLAGS, + DESC_BASE, + DESC_LIMIT, + DESC_ACCESS, + SET_CS, + SET_DS, + SET_ES, + SET_FS, + SET_GS, + SET_SS, + SET_TR, + SET_LDTR, + SET_X2APIC_STATE, + SET_VMCS_EXCEPTION_BITMAP, + SET_VMCS_ENTRY_INTERRUPTION_INFO, + SET_CAP, + CAPNAME, + UNASSIGN_PPTDEV, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu; + vm_paddr_t gpa; + size_t len; + struct vm_exit vmexit; + uint64_t ctl, eptp, bm, addr, u64; + struct vmctx *ctx; + int wired; + + uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + + struct option opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-vmcs-exception-bitmap", + REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-lowmem", NO_ARG, &get_lowmem, 1 }, + { "get-highmem",NO_ARG, &get_highmem, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, + { "get-vmcs-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-vmcs-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, + { "get-vmcs-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-vmcs-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1}, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-vmcs-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, + { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, + { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-vmcs-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-exit-reason", + NO_ARG, &get_vmcs_exit_reason, 1 }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1}, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1}, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 0 } + }; + + vcpu = 0; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_VMCS_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(); + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) + error = -1; + } + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + + if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + +#ifdef __FreeBSD__ + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); +#endif + + if (!error && set_exception_bitmap) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + exception_bitmap); + } + + if (!error && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && (get_lowmem || get_all)) { + gpa = 0; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error && (get_highmem || get_all)) { + gpa = 4 * GB; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } + + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (error == 0) + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_dr7 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + if (error == 0) + printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + } + + if (!error && (get_rsp || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + if (error == 0) + printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_rip || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + if (error == 0) + printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_rax || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + if (error == 0) + printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + } + + if (!error && (get_rbx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + if (error == 0) + printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + } + + if (!error && (get_rcx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + if (error == 0) + printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + } + + if (!error && (get_rdx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + if (error == 0) + printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + } + + if (!error && (get_rsi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + if (error == 0) + printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + } + + if (!error && (get_rdi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + if (error == 0) + printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + } + + if (!error && (get_rbp || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + if (error == 0) + printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + } + + if (!error && (get_r8 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + if (error == 0) + printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + } + + if (!error && (get_r9 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); + if (error == 0) + printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + } + + if (!error && (get_r10 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + if (error == 0) + printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + } + + if (!error && (get_r11 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + if (error == 0) + printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + } + + if (!error && (get_r12 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + if (error == 0) + printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + } + + if (!error && (get_r13 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + if (error == 0) + printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + } + + if (!error && (get_r14 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + if (error == 0) + printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + } + + if (!error && (get_r15 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + if (error == 0) + printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + } + + if (!error && (get_rflags || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error == 0) + printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + } + +#ifdef __FreeBSD__ + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } +#endif + + if (!error && (get_desc_ds || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_es || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_fs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ss || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_cs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_tr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ldtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gdtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_desc_idtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("idtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_cs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + if (error == 0) + printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + } + + if (!error && (get_ds || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + if (error == 0) + printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + } + + if (!error && (get_es || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + if (error == 0) + printf("es[%d]\t\t0x%04lx\n", vcpu, es); + } + + if (!error && (get_fs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + if (error == 0) + printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + } + + if (!error && (get_gs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + if (error == 0) + printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + } + + if (!error && (get_ss || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + if (error == 0) + printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + } + + if (!error && (get_tr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + if (error == 0) + printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + } + + if (!error && (get_ldtr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + if (error == 0) + printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + if (error == 0) + printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); + if (error == 0) + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); + if (error == 0) + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_entry_interruption_info || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_eptp || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + if (error == 0) + printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + &bm); + if (error == 0) + printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_cr0_mask || get_all)) { + uint64_t cr0mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); + if (error == 0) + printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); + } + + if (!error && (get_cr0_shadow || get_all)) { + uint64_t cr0shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, + &cr0shadow); + if (error == 0) + printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); + } + + if (!error && (get_cr4_mask || get_all)) { + uint64_t cr4mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); + if (error == 0) + printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); + } + + if (!error && (get_cr4_shadow || get_all)) { + uint64_t cr4shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, + &cr4shadow); + if (error == 0) + printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); + } + + if (!error && (get_cr3_targets || get_all)) { + uint64_t target_count, target_addr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, + &target_count); + if (error == 0) { + printf("cr3_target_count[%d]\t0x%08lx\n", + vcpu, target_count); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, + &target_addr); + if (error == 0) { + printf("cr3_target0[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, + &target_addr); + if (error == 0) { + printf("cr3_target1[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, + &target_addr); + if (error == 0) { + printf("cr3_target2[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, + &target_addr); + if (error == 0) { + printf("cr3_target3[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (error == 0) + printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (error == 0) + printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); + if (error == 0) + printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + error = dump_vmcs_msr_bitmap(vcpu, addr); + } + + if (!error && (get_vpid || get_all)) { + uint64_t vpid; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + if (error == 0) + printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); + } + + if (!error && (get_ple_window || get_all)) { + uint64_t window; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); + if (error == 0) + printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + } + + if (!error && (get_ple_gap || get_all)) { + uint64_t gap; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (error == 0) + printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + } + + if (!error && (get_inst_err || get_all)) { + uint64_t insterr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, + &insterr); + if (error == 0) { + printf("instruction_error[%d]\t0x%08lx\n", + vcpu, insterr); + } + } + + if (!error && (get_exit_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); + if (error == 0) + printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_entry_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); + if (error == 0) + printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_host_pat || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); + if (error == 0) + printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_pat || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_host_cr0 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); + if (error == 0) + printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_host_cr3 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); + if (error == 0) + printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_host_cr4 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); + if (error == 0) + printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_host_rip || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); + if (error == 0) + printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_host_rsp || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); + if (error == 0) + printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_guest_sysenter || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, &cs); + if (error == 0) + printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); + + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); + if (error == 0) + printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_vmcs_link || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); + if (error == 0) + printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_vmcs_exit_reason || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); + if (error == 0) + printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + if (!error && (get_vmcs_exit_interruption_info || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); + if (error == 0) { + printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_exit_interruption_error || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, + &u64); + if (error == 0) { + printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_interruptibility || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_INTERRUPTIBILITY, &u64); + if (error == 0) { + printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && (getcap || get_all)) { + int captype, val, getcaptype; + + if (getcap && capname) + getcaptype = vm_capability_name2type(capname); + else + getcaptype = -1; + + for (captype = 0; captype < VM_CAP_MAX; captype++) { + if (getcaptype >= 0 && captype != getcaptype) + continue; + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", + vm_capability_type2name(captype), + val ? "set" : "not set", vcpu); + } else if (errno == ENOENT) { + error = 0; + printf("Capability \"%s\" is not available\n", + vm_capability_type2name(captype)); + } else { + break; + } + } + } + + if (!error && run) { + error = vm_run(ctx, vcpu, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + error = vm_destroy(ctx); + + exit(error); +} diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile new file mode 100644 index 0000000000..bbcbacf32f --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG = bhyveload-uefi + +include ../Makefile.cmd + +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com new file mode 100644 index 0000000000..7865cca8d8 --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/Makefile.com @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyveload-uefi + +SRCS = ../bhyveload-uefi.c expand_number.c +OBJS = bhyveload-uefi.o expand_number.o + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include +LDLIBS += -lvmmapi + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyveload-uefi/amd64/Makefile new file mode 100644 index 0000000000..b602c50d05 --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c new file mode 100644 index 0000000000..62a7ca5d0f --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c @@ -0,0 +1,190 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/types.h> + +#include <machine/vmm.h> + +#include <errno.h> +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sysexits.h> +#include <unistd.h> + +#include <vmmapi.h> + +#define KB (1024UL) +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +#define UEFI_ROM_ADDR 0xFFE00000 +#define UEFI_ROM_SIZE (2 * MB) +/* + * N.B. the UEFI code zeros the first page in memory so use the second. + */ +#define BHYVE_HOB_ADDR 0x00002000 +#define BHYVE_BO_HOB_ADDR 0x00002080 + +#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin" + +struct platform_info { + uint32_t ncpus; +}; + +/* + * Boot order code: + * 0 - EFI_CD_HD + * 1 - EFI_CD + * 2 - EFI_HD_CD + * 3 - EFI_HD + * 4 - EFI_NET + * 5 - EFI_NET_CD_HD + * 6 - EFI_HD_HD_CD + * 7 - LEGACY_CD_HD + * 8 - LEGACY_CD + * 9 - LEGACY_HD_CD + * 10 - LEGACY_HD + * 11 - EFI_SHELL + */ + +struct bootorder_info { + uint32_t guestbootorder; +}; + +static char *vmname, *progname; +static struct vmctx *ctx; + +static void +usage(void) +{ + printf("usage: %s " + "[-c vcpus] [-m mem-size] [-b bootorder]" + "<vmname>\n", progname); + exit(1); +} + +int +main(int argc, char** argv) +{ + int opt, error, fd; + int guest_ncpus; + int guest_bootorder = 0; + uint64_t mem_size; + char *membase, *rombase; + struct platform_info *pi; + struct bootorder_info *bi; + + progname = argv[0]; + + guest_ncpus = 1; + mem_size = 256 * MB; + + while ((opt = getopt(argc, argv, "c:m:b:")) != -1) { + switch (opt) { + case 'c': + guest_ncpus = atoi(optarg); + break; + case 'm': + error = vm_parse_memsize(optarg, &mem_size); + if (error != 0 || mem_size == 0) + errx(EX_USAGE, "Invalid memsize '%s'", optarg); + break; + case 'b': + guest_bootorder = atoi(optarg); + if (guest_bootorder < 0 || guest_bootorder > 11) { + errx(EX_USAGE, "Invalid bootoption: %d\n" + "\tBoot order code:\n" + "\t0 - EFI_CD_HD\n" + "\t1 - EFI_CD\n" + "\t2 - EFI_HD_CD\n" + "\t3 - EFI_HD\n" + "\t4 - EFI_NET\n" + "\t5 - EFI_NET_CD_HD\n" + "\t6 - EFI_HD_HD_CD\n" + "\t7 - LEGACY_CD_HD\n" + "\t8 - LEGACY_CD\n" + "\t9 - LEGACY_HD_CD\n" + "\t10 - LEGACY_HD\n" + "\t11 - EFI_SHELL\n", guest_bootorder); + exit(1); + } + break; + case '?': + usage(); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + + vmname = argv[0]; + error = vm_create(vmname); + if (error != 0 && errno != EEXIST) { + perror("vm_create"); + exit(1); + + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1); + if (error) { + perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)"); + } + + error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); + if (error) { + perror("vm_setup_memory"); + exit(1); + } + membase = vm_map_gpa(ctx, 0, 8 * KB); + + error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); + if (error) { + perror("vm_setup_rom"); + exit(1); + } + rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); + + fd = open(UEFI_ROM_PATH, O_RDONLY); + if (fd == -1) { + perror("open"); + exit(1); + } + read(fd, rombase, UEFI_ROM_SIZE); + close(fd); + + pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR); + pi->ncpus = guest_ncpus; + bi = (struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR); + bi->guestbootorder = guest_bootorder; + + error = vcpu_reset(ctx, 0); + if (error) { + perror("vcpu_reset"); + exit(1); + } + + return (0); +} diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyveload-uefi/i386/Makefile new file mode 100644 index 0000000000..f5b7bb6915 --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/i386/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com + +install: all $(ROOTUSRSBINPROG32) diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile new file mode 100644 index 0000000000..bf9219b435 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile @@ -0,0 +1,20 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Pluribus Networks Inc. +# + +MAKEVARS = CW_NO_SHADOW=true __GNUC= + +include $(SRC)/Makefile.master +$(BUILD64)SUBDIRS += $(MACH64) +include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile new file mode 100644 index 0000000000..49ca0c5eb3 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile @@ -0,0 +1,32 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +MODULE = vmm.so +MDBTGT = kvm + +MODSRCS = vmm.c + +include ../../../../../Makefile.cmd +include ../../../../../Makefile.cmd.64 +include ../../../Makefile.amd64 +include ../../../../Makefile.module + +CPPFLAGS = -D_KERNEL -D_MACHDEP +CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 +CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc +CPPFLAGS += -I$(SRC)/cmd/mdb/common + +CPPFLAGS += -_cc=-xdryrun diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c new file mode 100644 index 0000000000..9e29d8662a --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c @@ -0,0 +1,238 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/param.h> + +#include <mdb/mdb_modapi.h> +#include <sys/cpuvar.h> +#include <sys/varargs.h> +#include <sys/vmm.h> +#include <sys/vmm_impl.h> + +/* + * VMM trace debug walker/dcmd code + */ + +/* + * Initialize the vmm_trace_dmsg_t walker by either using the given starting + * address, or reading the value of the kernel's vmm_debug_rbuf pointer. + * We also allocate a vmm_trace_dmsg_t for storage, and save this using the + * walk_data pointer. + */ +static int +vmm_dmsg_walk_i(mdb_walk_state_t *wsp) +{ + uintptr_t rbuf_addr; + vmm_trace_rbuf_t rbuf; + + if (wsp->walk_addr == NULL) { + if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) { + mdb_warn("failed to read 'vmm_debug_rbuf'"); + return (WALK_ERR); + } + + if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr) + == -1) { + mdb_warn("failed to read vmm_trace_rbuf_t at %p", + rbuf_addr); + return (WALK_ERR); + } + + wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh; + } + + /* + * Save ptr to head of ring buffer to prevent looping. + */ + wsp->walk_arg = (void *)wsp->walk_addr; + wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP); + return (WALK_NEXT); +} + +/* + * At each step, read a vmm_trace_dmsg_t into our private storage, and then + * invoke the callback function. We terminate when we reach a NULL next + * pointer. + */ +static int +vmm_dmsg_walk_s(mdb_walk_state_t *wsp) +{ + int status; + + if (wsp->walk_addr == NULL) + return (WALK_DONE); + + if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t), + wsp->walk_addr) == -1) { + mdb_warn("failed to read vmm_trace_dmsg_t at %p", + wsp->walk_addr); + return (WALK_ERR); + } + + status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, + wsp->walk_cbdata); + + wsp->walk_addr = + (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next); + + /* + * If we've looped then we're done. + */ + if (wsp->walk_addr == (uintptr_t)wsp->walk_arg) + wsp->walk_addr = NULL; + + return (status); +} + +/* + * The walker's fini function is invoked at the end of each walk. Since we + * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must + * free it now. + */ +static void +vmm_dmsg_walk_f(mdb_walk_state_t *wsp) +{ + mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t)); +} + +/* + * This routine is used by the vmm_dmsg_dump dcmd to dump content of + * VMM trace ring buffer. + */ +int +vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed) +{ + vmm_trace_dmsg_t dmsg, *dmsgh = addr; + char pathname[MAXPATHLEN]; + char merge[1024]; + + while (addr != NULL) { + if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) != + sizeof (dmsg)) { + mdb_warn("failed to read message pointer in kernel"); + return (DCMD_ERR); + } + + (void) mdb_snprintf(merge, sizeof (merge), + "[%Y:%03d:%03d:%03d] : %s", + dmsg.timestamp.tv_sec, + (int)dmsg.timestamp.tv_nsec/1000000, + (int)(dmsg.timestamp.tv_nsec/1000)%1000, + (int)dmsg.timestamp.tv_nsec%1000, + dmsg.buf); + + mdb_printf("%s", merge); + + if (printed != NULL) { + (*printed)++; + } + + if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) { + break; + } + } + + return (DCMD_OK); +} + +/* + * 1. Process flag passed to vmm_dmsg_dump dcmd. + * 2. Obtain VMM trace ring buffer pointer. + * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump() + * to dump content of VMM trace ring buffer. + */ +int +vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + vmm_trace_rbuf_t rbuf; + uint_t printed = 0; /* have we printed anything? */ + int print_pathname = FALSE; + int rval = DCMD_OK; + + if (argc > 1) { + return (DCMD_USAGE); + } + + if (mdb_getopts(argc, argv, + 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) { + return (DCMD_USAGE); + } + + /* + * If ring buffer address not provided try to obtain + * it using vmm_debug_rbuf global. + */ + if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) { + if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) { + mdb_warn("Failed to read 'vmm_debug_rbuf'."); + return (DCMD_ERR); + } + } + + if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) { + mdb_warn("Failed to read ring buffer in kernel."); + return (DCMD_ERR); + } + + if (rbuf.dmsgh == NULL) { + mdb_printf("The vmm trace ring buffer is empty.\n"); + return (DCMD_OK); + } + + rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh, + print_pathname, &printed); + + if (rval != DCMD_OK) { + return (rval); + } + + if (printed == 0) { + mdb_warn("Failed to read vmm trace ring buffer."); + return (DCMD_ERR); + } + + return (rval); +} + +/* + * MDB module linkage information: + * + * We declare a list of structures describing our dcmds, a list of structures + * describing our walkers, and a function named _mdb_init to return a pointer + * to our module information. + */ + +static const mdb_dcmd_t dcmds[] = { + { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages", + vmm_rbuf_dump }, + { NULL } +}; + +static const mdb_walker_t walkers[] = { + { "vmm_dmsg", + "walk ring buffer containing vmm trace debug messages", + vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { + MDB_API_VERSION, dcmds, walkers +}; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h new file mode 100644 index 0000000000..fcf35a7b78 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ + +#define ENTRY(x) \ + .text; .p2align 4,0x90; \ + .globl x; \ + .type x, @function; \ +x: + +#define END(x) \ + .size x, [.-x] + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h new file mode 100644 index 0000000000..5b78143d21 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -0,0 +1,244 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ + +static __inline u_char +atomic_load_acq_char(volatile u_char *p) +{ + u_char res; + + __asm volatile("lock ; " "cmpxchgb %b0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) : "memory", "cc"); + return (res); +} + +static __inline u_short +atomic_load_acq_short(volatile u_short *p) +{ + u_short res; + + __asm volatile("lock ; " "cmpxchgw %w0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline u_int +atomic_load_acq_int(volatile u_int *p) +{ + u_int res; + + __asm volatile("lock ; " "cmpxchgl %0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline u_long +atomic_load_acq_long(volatile u_long *p) +{ + u_long res; + + __asm volatile("lock ; " "cmpxchgq %0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline void +atomic_store_rel_char(volatile u_char *p, u_char v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_short(volatile u_short *p, u_short v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_int(volatile u_int *p, u_int v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_long(volatile u_long *p, u_long v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +/* + * Atomic compare and set. + * + * if (*dst == expect) *dst = src (all 32 bit words) + * + * Returns 0 on failure, non-zero on success + */ +static __inline int +atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src) +{ + u_char res; + + __asm __volatile( + " lock ; " + " cmpxchgl %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_int" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +static __inline int +atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) +{ + u_char res; + + __asm __volatile( + " lock ; " + " cmpxchgq %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_long" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +/* + * Atomically add the value of v to the integer pointed to by p and return + * the previous value of *p. + */ +static __inline u_int +atomic_fetchadd_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " lock ; " + " xaddl %0, %1 ; " + "# atomic_fetchadd_int" + : "+r" (v), /* 0 (result) */ + "=m" (*p) /* 1 */ + : "m" (*p) /* 2 */ + : "cc"); + return (v); +} + +static __inline void +atomic_set_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "orl %1,%0" + : "=m" (*p) + : "ir" (v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_clear_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "andl %1,%0" + : "=m" (*p) + : "ir" (~v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_subtract_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "subl %1,%0" + : "=m" (*p) + : "ir" (v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_set_long(volatile u_long *p, u_long v) +{ + __asm volatile( + "lock ; " "orq %1,%0" + : "+m" (*p) + : "ir" (v) + : "cc"); +} + +static __inline void +atomic_clear_long(volatile u_long *p, u_long v) +{ + __asm volatile("lock ; " "andq %1,%0" + : "+m" (*p) + : "ir" (~v) + : "cc"); +} + +static __inline u_int +atomic_swap_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " xchgl %1,%0 ; " + "# atomic_swap_int" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +static __inline u_long +atomic_swap_long(volatile u_long *p, u_long v) +{ + + __asm __volatile( + " xchgq %1,%0 ; " + "# atomic_swap_long" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +#define atomic_readandclear_int(p) atomic_swap_int(p, 0) +#define atomic_readandclear_long(p) atomic_swap_long(p, 0) + +/* Operations on 32-bit double words. */ +#define atomic_load_acq_32 atomic_load_acq_int +#define atomic_store_rel_32 atomic_store_rel_int +#define atomic_cmpset_32 atomic_cmpset_int + +/* Operations on 64-bit quad words. */ +#define atomic_cmpset_64 atomic_cmpset_long +#define atomic_readandclear_64 atomic_readandclear_long + +/* Operations on pointers. */ +#define atomic_cmpset_ptr atomic_cmpset_long + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/clock.h b/usr/src/compat/freebsd/amd64/machine/clock.h new file mode 100644 index 0000000000..f50b42a126 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/clock.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ + +extern uint64_t cpu_freq_hz; + +#define tsc_freq cpu_freq_hz + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h new file mode 100644 index 0000000000..cf485e947c --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h @@ -0,0 +1,165 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ + +static __inline u_long +bsfq(u_long mask) +{ + u_long result; + + __asm __volatile("bsfq %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline u_int +bsrl(u_int mask) +{ + u_int result; + + __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline u_long +bsrq(u_long mask) +{ + u_long result; + + __asm __volatile("bsrq %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline void +clts(void) +{ + __asm __volatile("clts"); +} + +static __inline void +do_cpuid(u_int ax, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); +} + +static __inline void +cpuid_count(u_int ax, u_int cx, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax), "c" (cx)); +} + +static __inline void +enable_intr(void) +{ + __asm __volatile("sti"); +} + +static __inline int +ffsl(long mask) +{ + return (mask == 0 ? mask : (int)bsfq((u_long)mask) + 1); +} + +static __inline int +fls(int mask) +{ + return (mask == 0 ? mask : (int)bsrl((u_int)mask) + 1); +} + +static __inline int +flsl(long mask) +{ + return (mask == 0 ? mask : (int)bsrq((u_long)mask) + 1); +} + +static __inline int +flsll(long long mask) +{ + return (flsl((long)mask)); +} + +static __inline uint64_t +rdmsr(u_int msr) +{ + uint32_t low, high; + + __asm __volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr)); + return (low | ((uint64_t)high << 32)); +} + +static __inline uint64_t +rdtsc(void) +{ + uint32_t low, high; + + __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +wrmsr(u_int msr, uint64_t newval) +{ + uint32_t low, high; + + low = newval; + high = newval >> 32; + __asm __volatile("wrmsr" : : "a" (low), "d" (high), "c" (msr)); +} + +static __inline void +load_cr0(u_long data) +{ + __asm __volatile("movq %0,%%cr0" : : "r" (data)); +} + +static __inline u_long +rcr0(void) +{ + u_long data; + + __asm __volatile("movq %%cr0,%0" : "=r" (data)); + return (data); +} + +static __inline u_long +rcr3(void) +{ + u_long data; + + __asm __volatile("movq %%cr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr4(u_long data) +{ + __asm __volatile("movq %0,%%cr4" : : "r" (data)); +} + +static __inline u_long +rcr4(void) +{ + u_long data; + + __asm __volatile("movq %%cr4,%0" : "=r" (data)); + return (data); +} + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h new file mode 100644 index 0000000000..48e686780c --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/fpu.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ + +#define XSAVE_AREA_ALIGN 64 + +void fpuexit(kthread_t *td); +void fpurestore(void *); +void fpusave(void *); + +struct savefpu *fpu_save_area_alloc(void); +void fpu_save_area_free(struct savefpu *fsa); +void fpu_save_area_reset(struct savefpu *fsa); + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h new file mode 100644 index 0000000000..60fdd566e5 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/md_var.h @@ -0,0 +1,24 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ + +extern u_int cpu_high; /* Highest arg to CPUID */ +extern u_int cpu_exthigh; /* Highest arg to extended CPUID */ +extern u_int cpu_id; /* Stepping ID */ +extern char cpu_vendor[]; /* CPU Origin code */ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h new file mode 100644 index 0000000000..eaca5ab8d7 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/param.h @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ + +#ifdef _KERNEL +#define MAXCPU NCPU +#endif /* _KERNEL */ + +#define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ +#define PAGE_SIZE (1<<PAGE_SHIFT) /* bytes/page */ +#define PAGE_MASK (PAGE_SIZE-1) + +/* Size of the level 1 page table units */ +#define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) + +/* Size of the level 2 page directory units */ +#define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t))) + +/* Size of the level 3 page directory pointer table units */ +#define NPDPEPG (PAGE_SIZE/(sizeof (pdp_entry_t))) + +/* Size of the level 4 page-map level-4 table units */ +#define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t))) + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/pcb.h b/usr/src/compat/freebsd/amd64/machine/pcb.h new file mode 100644 index 0000000000..75b5de640c --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/pcb.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ + +#include <machine/fpu.h> + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h new file mode 100644 index 0000000000..d0303bdd56 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/pmap.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ + + /* ---- Intel Nomenclature ---- */ +#define PG_V 0x001 /* P Valid */ +#define PG_RW 0x002 /* R/W Read/Write */ +#define PG_U 0x004 /* U/S User/Supervisor */ +#define PG_A 0x020 /* A Accessed */ +#define PG_M 0x040 /* D Dirty */ +#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ + +/* + * Page Protection Exception bits + */ +#define PGEX_P 0x01 /* Protection violation vs. not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ +#define PGEX_I 0x10 /* during an instruction fetch */ + +typedef u_int64_t pd_entry_t; +typedef u_int64_t pt_entry_t; +typedef u_int64_t pdp_entry_t; +typedef u_int64_t pml4_entry_t; + +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) +vm_paddr_t pmap_kextract(vm_offset_t va); + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/segments.h b/usr/src/compat/freebsd/amd64/machine/segments.h new file mode 100644 index 0000000000..d0655f4a0e --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/segments.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ + +#include <x86/segments.h> + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h new file mode 100644 index 0000000000..ef719b9684 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/smp.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h new file mode 100644 index 0000000000..79c3ec959e --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ + +#include <sys/vmm.h> + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_dev.h b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h new file mode 100644 index 0000000000..fe9cb6c705 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ + +#include <sys/vmm_dev.h> + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h new file mode 100644 index 0000000000..02c3f391c7 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ + +#include <sys/vmm_instruction_emul.h> + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h new file mode 100644 index 0000000000..c80c2af545 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */ diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h new file mode 100644 index 0000000000..e22ffc0551 --- /dev/null +++ b/usr/src/compat/freebsd/libutil.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_LIBUTIL_H_ +#define _COMPAT_FREEBSD_LIBUTIL_H_ + +int expand_number(const char *_buf, uint64_t *_num); + +#endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */ diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h new file mode 100644 index 0000000000..a0d5a828c6 --- /dev/null +++ b/usr/src/compat/freebsd/net/ethernet.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ +#define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ + +#include <sys/ethernet.h> + +#endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */ diff --git a/usr/src/compat/freebsd/paths.h b/usr/src/compat/freebsd/paths.h new file mode 100644 index 0000000000..e43c963f93 --- /dev/null +++ b/usr/src/compat/freebsd/paths.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_PATHS_H_ +#define _COMPAT_FREEBSD_PATHS_H_ + +#define _PATH_TMP "/tmp/" + +#endif /* _COMPAT_FREEBSD_PATHS_H_ */ diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h new file mode 100644 index 0000000000..641c58f406 --- /dev/null +++ b/usr/src/compat/freebsd/pthread_np.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_ +#define _COMPAT_FREEBSD_PTHREAD_NP_H_ + +#include <sys/param.h> +#include <sys/cpuset.h> + +#include <synch.h> + +#define pthread_set_name_np(thread, name) + +#define pthread_mutex_isowned_np(x) _mutex_held(x) + +#endif /* _COMPAT_FREEBSD_PTHREAD_NP_H_ */ diff --git a/usr/src/compat/freebsd/string.h b/usr/src/compat/freebsd/string.h new file mode 100644 index 0000000000..7e0f5c7ddc --- /dev/null +++ b/usr/src/compat/freebsd/string.h @@ -0,0 +1,26 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_STRING_H_ +#define _COMPAT_FREEBSD_STRING_H_ + +/* + * This is quite a hack; blame bcopy/bcmp/bzero and memcpy/memcmp/memset. + */ +#include <strings.h> + +#include_next <string.h> + +#endif /* _COMPAT_FREEBSD_STRING_H_ */ diff --git a/usr/src/compat/freebsd/strings.h b/usr/src/compat/freebsd/strings.h new file mode 100644 index 0000000000..fa3539fb96 --- /dev/null +++ b/usr/src/compat/freebsd/strings.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_STRINGS_H_ +#define _COMPAT_FREEBSD_STRINGS_H_ + +#include <machine/cpufunc.h> + +#include_next <strings.h> + +#endif /* _COMPAT_FREEBSD_STRINGS_H_ */ diff --git a/usr/src/compat/freebsd/sys/_iovec.h b/usr/src/compat/freebsd/sys/_iovec.h new file mode 100644 index 0000000000..b755ae7e21 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_iovec.h @@ -0,0 +1,24 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__IOVEC_H_ +#define _COMPAT_FREEBSD_SYS__IOVEC_H_ + +struct iovec { + void *iov_base; /* Base address. */ + size_t iov_len; /* Length. */ +}; + +#endif /* _COMPAT_FREEBSD_SYS__IOVEC_H_ */ diff --git a/usr/src/compat/freebsd/sys/_pthreadtypes.h b/usr/src/compat/freebsd/sys/_pthreadtypes.h new file mode 100644 index 0000000000..d746da3712 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_pthreadtypes.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ +#define _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ + +#endif /* _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/_types.h b/usr/src/compat/freebsd/sys/_types.h new file mode 100644 index 0000000000..62c327d216 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_types.h @@ -0,0 +1,22 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__TYPES_H_ +#define _COMPAT_FREEBSD_SYS__TYPES_H_ + +#include <sys/cdefs.h> +#include <machine/_types.h> + +#endif /* _COMPAT_FREEBSD_SYS__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h new file mode 100644 index 0000000000..17b6e31507 --- /dev/null +++ b/usr/src/compat/freebsd/sys/callout.h @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ +#define _COMPAT_FREEBSD_SYS_CALLOUT_H_ + +#include <sys/cyclic.h> + +struct callout { + cyclic_id_t c_cyc_id; + int c_flags; + void (*c_func)(void *); + void *c_arg; + +}; + +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ + +#define C_ABSOLUTE 0x0200 /* event time is absolute. */ + +#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) +#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) +#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) + +void vmm_glue_callout_init(struct callout *c, int mpsafe); +int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, + sbintime_t pr, void (*func)(void *), void *arg, int flags); +int vmm_glue_callout_stop(struct callout *c); +int vmm_glue_callout_drain(struct callout *c); + +static __inline void +callout_init(struct callout *c, int mpsafe) +{ + vmm_glue_callout_init(c, mpsafe); +} + +static __inline int +callout_stop(struct callout *c) +{ + return (vmm_glue_callout_stop(c)); +} + +static __inline int +callout_drain(struct callout *c) +{ + return (vmm_glue_callout_drain(c)); +} + +static __inline int +callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, + void (*func)(void *), void *arg, int flags) +{ + return (vmm_glue_callout_reset_sbt(c, sbt, pr, func, arg, flags)); +} + + +#endif /* _COMPAT_FREEBSD_SYS_CALLOUT_H_ */ diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h new file mode 100644 index 0000000000..974e323dbe --- /dev/null +++ b/usr/src/compat/freebsd/sys/cdefs.h @@ -0,0 +1,58 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ +#define _COMPAT_FREEBSD_SYS_CDEFS_H_ + +#define __FBSDID(s) + +#ifdef __GNUC__ +#define inline __inline + +#define __GNUCLIKE___SECTION 1 + +#define __dead2 __attribute__((__noreturn__)) +#define __unused __attribute__((__unused__)) +#define __used __attribute__((__used__)) +#define __packed __attribute__((__packed__)) +#define __aligned(x) __attribute__((__aligned__(x))) +#define __section(x) __attribute__((__section__(x))) +#endif + +/* + * The __CONCAT macro is used to concatenate parts of symbol names, e.g. + * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. + * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI + * mode -- there must be no spaces between its arguments, and for nested + * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also + * concatenate double-quoted strings produced by the __STRING macro, but + * this only works with ANSI C. + * + * __XSTRING is like __STRING, but it expands any macros in its argument + * first. It is only available with ANSI C. + */ +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* full-blown ANSI C */ +#define __CONCAT1(x,y) x ## y +#define __CONCAT(x,y) __CONCAT1(x,y) +#define __STRING(x) #x /* stringify without expanding x */ +#define __XSTRING(x) __STRING(x) /* expand x, then stringify */ +#else /* !(__STDC__ || __cplusplus) */ +#define __P(protos) () /* traditional C preprocessor */ +#define __CONCAT(x,y) x/**/y +#define __STRING(x) "x" +#endif /* !(__STDC__ || __cplusplus) */ + +#endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */ diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h new file mode 100644 index 0000000000..8527624b5e --- /dev/null +++ b/usr/src/compat/freebsd/sys/cpuset.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_ +#define _COMPAT_FREEBSD_SYS_CPUSET_H_ + +#define NOCPU -1 + +#ifdef _KERNEL +#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu) +#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu) +#define CPU_ZERO(set) CPUSET_ZERO(*(set)) +#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu) +#define CPU_FFS(set) cpusetobj_ffs(set) +#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu) +#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2)) +#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu) + +#include <sys/cpuvar.h> + +int cpusetobj_ffs(const cpuset_t *set); +#else +#include <machine/atomic.h> + +typedef int cpuset_t; + +#define CPUSET(cpu) (1UL << (cpu)) + +#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu)) +#endif + +#endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/disk.h b/usr/src/compat/freebsd/sys/disk.h new file mode 100644 index 0000000000..c9bdc6a2d8 --- /dev/null +++ b/usr/src/compat/freebsd/sys/disk.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_DISK_H_ +#define _COMPAT_FREEBSD_SYS_DISK_H_ + +#endif /* _COMPAT_FREEBSD_SYS_DISK_H_ */ diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h new file mode 100644 index 0000000000..a31bff55d6 --- /dev/null +++ b/usr/src/compat/freebsd/sys/endian.h @@ -0,0 +1,125 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ +#define _COMPAT_FREEBSD_SYS_ENDIAN_H_ + +static __inline uint16_t +be16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[0] << 8) | p[1]); +} + +static __inline uint32_t +be32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]); +} + +static __inline uint64_t +be64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)be32dec(p) << 32) | be32dec(p + 4)); +} + +static __inline uint16_t +le16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[1] << 8) | p[0]); +} + +static __inline uint32_t +le32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]); +} + +static __inline uint64_t +le64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)le32dec(p + 4) << 32) | le32dec(p)); +} + +static __inline void +be16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static __inline void +be32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 24) & 0xff; + p[1] = (u >> 16) & 0xff; + p[2] = (u >> 8) & 0xff; + p[3] = u & 0xff; +} + +static __inline void +be64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + be32enc(p, (uint32_t)(u >> 32)); + be32enc(p + 4, (uint32_t)(u & 0xffffffffU)); +} + +static __inline void +le16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; +} + +static __inline void +le32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; + p[2] = (u >> 16) & 0xff; + p[3] = (u >> 24) & 0xff; +} + +static __inline void +le64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + le32enc(p, (uint32_t)(u & 0xffffffffU)); + le32enc(p + 4, (uint32_t)(u >> 32)); +} + +#endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/compat/freebsd/sys/errno.h b/usr/src/compat/freebsd/sys/errno.h new file mode 100644 index 0000000000..bd37f43065 --- /dev/null +++ b/usr/src/compat/freebsd/sys/errno.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_ERRNO_H_ +#define _COMPAT_FREEBSD_SYS_ERRNO_H_ + +#ifndef _KERNEL +extern int *___errno(); + +#define errno (*(___errno())) +#endif + +#include_next <sys/errno.h> + +#endif /* _COMPAT_FREEBSD_SYS_ERRNO_H_ */ diff --git a/usr/src/compat/freebsd/sys/fcntl.h b/usr/src/compat/freebsd/sys/fcntl.h new file mode 100644 index 0000000000..062a3b84ac --- /dev/null +++ b/usr/src/compat/freebsd/sys/fcntl.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_FCNTL_H_ +#define _COMPAT_FREEBSD_SYS_FCNTL_H_ + +#define O_DIRECT 0x0 + +#include_next <sys/fcntl.h> + +#endif /* _COMPAT_FREEBSD_SYS_FCNTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h new file mode 100644 index 0000000000..e223e1e4c7 --- /dev/null +++ b/usr/src/compat/freebsd/sys/ioctl.h @@ -0,0 +1,22 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_IOCTL_H_ +#define _COMPAT_FREEBSD_SYS_IOCTL_H_ + +#include <sys/ioccom.h> +#include_next <sys/ioctl.h> + +#endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h new file mode 100644 index 0000000000..b1c07674e4 --- /dev/null +++ b/usr/src/compat/freebsd/sys/kernel.h @@ -0,0 +1,25 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_ +#define _COMPAT_FREEBSD_SYS_KERNEL_H_ + +#define SYSINIT(uniquifier, subsystem, order, func, ident) + +#include <sys/linker_set.h> + +#define ticks ddi_get_lbolt() + +#endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */ diff --git a/usr/src/compat/freebsd/sys/ktr.h b/usr/src/compat/freebsd/sys/ktr.h new file mode 100644 index 0000000000..96c499ef18 --- /dev/null +++ b/usr/src/compat/freebsd/sys/ktr.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_KTR_H_ +#define _COMPAT_FREEBSD_SYS_KTR_H_ + +#define CTR0(m, format) +#define CTR1(m, format, p1) +#define CTR2(m, format, p1, p2) +#define CTR3(m, format, p1, p2, p3) +#define CTR4(m, format, p1, p2, p3, p4) +#define CTR5(m, format, p1, p2, p3, p4, p5) +#define CTR6(m, d, p1, p2, p3, p4, p5, p6) + +#endif /* _COMPAT_FREEBSD_SYS_KTR_H_ */ diff --git a/usr/src/compat/freebsd/sys/libkern.h b/usr/src/compat/freebsd/sys/libkern.h new file mode 100644 index 0000000000..94675a0d66 --- /dev/null +++ b/usr/src/compat/freebsd/sys/libkern.h @@ -0,0 +1,25 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LIBKERN_H_ +#define _COMPAT_FREEBSD_SYS_LIBKERN_H_ + +#include <sys/systm.h> + +#ifndef min +static __inline u_int min(u_int a, u_int b) { return (a < b ? a : b); } +#endif + +#endif /* _COMPAT_FREEBSD_SYS_LIBKERN_H_ */ diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h new file mode 100644 index 0000000000..99ae0f4d64 --- /dev/null +++ b/usr/src/compat/freebsd/sys/limits.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_ +#define _COMPAT_FREEBSD_SYS_LIMITS_H_ + +#endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */ diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h new file mode 100644 index 0000000000..579df44533 --- /dev/null +++ b/usr/src/compat/freebsd/sys/malloc.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MALLOC_H_ +#define _COMPAT_FREEBSD_SYS_MALLOC_H_ + +/* + * flags to malloc. + */ +#define M_NOWAIT 0x0001 /* do not block */ +#define M_WAITOK 0x0002 /* ok to block */ +#define M_ZERO 0x0100 /* bzero the allocation */ + +struct malloc_type { + const char *ks_shortdesc; /* Printable type name. */ +}; + +#ifdef _KERNEL +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1] = { \ + { shortdesc } \ + } + +#define MALLOC_DECLARE(type) \ + extern struct malloc_type type[1] + +void free(void *addr, struct malloc_type *type); +void *malloc(unsigned long size, struct malloc_type *type, int flags); +void *old_malloc(unsigned long size, struct malloc_type *type , int flags); +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */ diff --git a/usr/src/compat/freebsd/sys/module.h b/usr/src/compat/freebsd/sys/module.h new file mode 100644 index 0000000000..87b73e3fa3 --- /dev/null +++ b/usr/src/compat/freebsd/sys/module.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MODULE_H_ +#define _COMPAT_FREEBSD_SYS_MODULE_H_ + +#endif /* _COMPAT_FREEBSD_SYS_MODULE_H_ */ diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h new file mode 100644 index 0000000000..b99884b652 --- /dev/null +++ b/usr/src/compat/freebsd/sys/mutex.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_ +#define _COMPAT_FREEBSD_SYS_MUTEX_H_ + +#ifdef _KERNEL + +#include <sys/debug.h> + +#define MTX_DEF 0x00000000 +#define MTX_SPIN 0x00000001 + +struct mtx; + +void mtx_init(struct mtx *, char *name, const char *type_name, int opts); +void mtx_destroy(struct mtx *); + +int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg, + int timo); + +#endif /* KERNEL */ +#include_next <sys/mutex.h> +#ifdef _KERNEL + +struct mtx { + kmutex_type_t t; + kmutex_t m; +}; + +static __inline void mtx_lock(struct mtx *mtx) +{ + mutex_enter(&mtx->m); +} + +static __inline void mtx_unlock(struct mtx *mtx) +{ + mutex_exit(&mtx->m); +} + +static __inline void mtx_lock_spin(struct mtx *mtx) +{ + mutex_enter(&mtx->m); +} + +static __inline void mtx_unlock_spin(struct mtx *mtx) +{ + mutex_exit(&mtx->m); +} + +static __inline int mtx_owned(struct mtx *mtx) +{ + return (mutex_owned(&mtx->m)); +} + +#define MA_OWNED 0 + +static __inline void mtx_assert(struct mtx *mtx, int what) +{ + switch (what) { + case MA_OWNED: + ASSERT(mutex_owned(&mtx->m)); + break; + } +} + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_MUTEX_H_ */ diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h new file mode 100644 index 0000000000..f09e9183f6 --- /dev/null +++ b/usr/src/compat/freebsd/sys/param.h @@ -0,0 +1,48 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_PARAM_H_ +#define _COMPAT_FREEBSD_SYS_PARAM_H_ + +#ifndef _KERNEL +#define MAXCOMLEN 16 +#endif +#define MAXHOSTNAMELEN 256 + +#ifdef _KERNEL +#include <sys/time.h> + +#ifndef FALSE +#define FALSE 0 +#endif +#ifndef TRUE +#define TRUE 1 +#endif +#endif + +#include <machine/param.h> + +#define nitems(x) (sizeof((x)) / sizeof((x)[0])) +#define rounddown(x,y) (((x)/(y))*(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ +#define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ + +/* Macros for min/max. */ +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +#include_next <sys/param.h> + +#endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/sys/pcpu.h b/usr/src/compat/freebsd/sys/pcpu.h new file mode 100644 index 0000000000..f29c9c5018 --- /dev/null +++ b/usr/src/compat/freebsd/sys/pcpu.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_PCPU_H_ +#define _COMPAT_FREEBSD_SYS_PCPU_H_ + +#define curcpu (CPU->cpu_id) + +#endif /* _COMPAT_FREEBSD_SYS_PCPU_H_ */ diff --git a/usr/src/compat/freebsd/sys/sched.h b/usr/src/compat/freebsd/sys/sched.h new file mode 100644 index 0000000000..b426ee757e --- /dev/null +++ b/usr/src/compat/freebsd/sys/sched.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SCHED_H_ +#define _COMPAT_FREEBSD_SYS_SCHED_H_ + +#endif /* _COMPAT_FREEBSD_SYS_SCHED_H_ */ diff --git a/usr/src/compat/freebsd/sys/select.h b/usr/src/compat/freebsd/sys/select.h new file mode 100644 index 0000000000..fcb40c23b1 --- /dev/null +++ b/usr/src/compat/freebsd/sys/select.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MODULE_H_ +#define _COMPAT_FREEBSD_SYS_MODULE_H_ + +void *memset(void *s, int c, size_t n); + +#include_next <sys/select.h> + +#endif /* _COMPAT_FREEBSD_SYS_MODULE_H_ */ diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h new file mode 100644 index 0000000000..46183e8677 --- /dev/null +++ b/usr/src/compat/freebsd/sys/smp.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SMP_H_ +#define _COMPAT_FREEBSD_SYS_SMP_H_ + +#include <sys/cpuset.h> + +void smp_rendezvous(void (*)(void *), + void (*)(void *), + void (*)(void *), + void *arg); + +void ipi_cpu(int cpu, u_int ipi); + +#endif /* _COMPAT_FREEBSD_SYS_SMP_H_ */ diff --git a/usr/src/compat/freebsd/sys/sysctl.h b/usr/src/compat/freebsd/sys/sysctl.h new file mode 100644 index 0000000000..9f6a695e34 --- /dev/null +++ b/usr/src/compat/freebsd/sys/sysctl.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SYSCTL_H_ +#define _COMPAT_FREEBSD_SYS_SYSCTL_H_ + +#define SYSCTL_DECL(name) + +#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) + +#define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) +#define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) +#define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr) + +#endif /* _COMPAT_FREEBSD_SYS_SYSCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h new file mode 100644 index 0000000000..e25acc0e4a --- /dev/null +++ b/usr/src/compat/freebsd/sys/systm.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SYSTM_H_ +#define _COMPAT_FREEBSD_SYS_SYSTM_H_ + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <sys/callout.h> +#include <sys/queue.h> + +struct mtx; + +#define KASSERT(exp,msg) do { \ + if (!(exp)) \ + panic msg; \ +} while (0) + +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x,y) __CTASSERT(x,y) +#define __CTASSERT(x,y) typedef char __assert ## y[(x) ? 1 : -1] + +void critical_enter(void); +void critical_exit(void); + +int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg, + int ticks); +void wakeup(void *chan); +void wakeup_one(void *chan); + +struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); +void delete_unrhdr(struct unrhdr *uh); +int alloc_unr(struct unrhdr *uh); +void free_unr(struct unrhdr *uh, u_int item); + +#include <sys/libkern.h> + +#include_next <sys/systm.h> +#include <sys/cmn_err.h> + +#endif /* _COMPAT_FREEBSD_SYS_SYSTM_H_ */ diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h new file mode 100644 index 0000000000..f8f9da5cdf --- /dev/null +++ b/usr/src/compat/freebsd/sys/time.h @@ -0,0 +1,104 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_TIME_H_ +#define _COMPAT_FREEBSD_SYS_TIME_H_ + +#include_next <sys/time.h> + +#define tc_precexp 0 + +struct bintime { + ulong_t sec; /* seconds */ + uint64_t frac; /* 64 bit fraction of a second */ +}; + +#define BT2FREQ(bt) \ + (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ + ((bt)->frac >> 1)) + +#define FREQ2BT(freq, bt) \ +{ \ + (bt)->sec = 0; \ + (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ +} + +static __inline void +binuptime(struct bintime *bt) +{ + hrtime_t now = gethrtime(); + + bt->sec = now / 1000000000; + /* 18446744073 = int(2^64 / 1000000000) = 1ns in 64-bit fractions */ + bt->frac = (now % 1000000000) * (uint64_t)18446744073LL; +} + +#define bintime_cmp(a, b, cmp) \ + (((a)->sec == (b)->sec) ? \ + ((a)->frac cmp (b)->frac) : \ + ((a)->sec cmp (b)->sec)) + +#define SBT_1US (1000) + +static __inline void +bintime_add(struct bintime *bt, const struct bintime *bt2) +{ + uint64_t u; + + u = bt->frac; + bt->frac += bt2->frac; + if (u > bt->frac) + bt->sec++; + bt->sec += bt2->sec; +} + +static __inline void +bintime_sub(struct bintime *bt, const struct bintime *bt2) +{ + uint64_t u; + + u = bt->frac; + bt->frac -= bt2->frac; + if (u < bt->frac) + bt->sec--; + bt->sec -= bt2->sec; +} + +static __inline void +bintime_mul(struct bintime *bt, u_int x) +{ + uint64_t p1, p2; + + p1 = (bt->frac & 0xffffffffull) * x; + p2 = (bt->frac >> 32) * x + (p1 >> 32); + bt->sec *= x; + bt->sec += (p2 >> 32); + bt->frac = (p2 << 32) | (p1 & 0xffffffffull); +} + +static __inline sbintime_t +bttosbt(const struct bintime bt) +{ + return ((bt.sec * 1000000000) + + (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32)); +} + +static __inline sbintime_t +sbinuptime(void) +{ + return (gethrtime()); +} + +#endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */ diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h new file mode 100644 index 0000000000..6fc8179f2e --- /dev/null +++ b/usr/src/compat/freebsd/sys/types.h @@ -0,0 +1,74 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_TYPES_H_ +#define _COMPAT_FREEBSD_SYS_TYPES_H_ + +#include <sys/_types.h> + +typedef __uint8_t u_int8_t; /* unsigned integrals (deprecated) */ +typedef __uint16_t u_int16_t; +typedef __uint32_t u_int32_t; +typedef __uint64_t u_int64_t; + +#ifndef __REGISTER_T_DEFINED +#define __REGISTER_T_DEFINED +typedef __register_t register_t; +#endif + +#ifndef __SBINTIME_T_DEFINED +#define __SBINTIME_T_DEFINED +typedef __int64_t sbintime_t; +#endif + +#ifndef __VM_MEMATTR_T_DEFINED +#define __VM_MEMATTR_T_DEFINED +typedef char vm_memattr_t; +#endif + +#ifndef __VM_OFFSET_T_DEFINED +#define __VM_OFFSET_T_DEFINED +typedef __vm_offset_t vm_offset_t; +#endif + +#ifndef __VM_OOFFSET_T_DEFINED +#define __VM_OOFFSET_T_DEFINED +typedef __vm_ooffset_t vm_ooffset_t; +#endif + +#ifndef __VM_PADDR_T_DEFINED +#define __VM_PADDR_T_DEFINED +typedef __vm_paddr_t vm_paddr_t; +#endif + +#ifndef __VM_MEMATTR_T_DEFINED +#define __VM_MEMATTR_T_DEFINED +typedef char vm_memattr_t; +#endif + +#ifndef __bool_true_false_are_defined +#define __bool_true_false_are_defined 1 +#define false 0 +#define true 1 +typedef _Bool bool; +#endif + +#if defined(_KERNEL) && !defined(offsetof) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#include_next <sys/types.h> + +#endif /* _COMPAT_FREEBSD_SYS_TYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/uio.h b/usr/src/compat/freebsd/sys/uio.h new file mode 100644 index 0000000000..05c6f2a028 --- /dev/null +++ b/usr/src/compat/freebsd/sys/uio.h @@ -0,0 +1,26 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_UIO_H_ +#define _COMPAT_FREEBSD_SYS_UIO_H_ + +#include_next <sys/uio.h> + +#ifndef _KERNEL +ssize_t preadv(int, const struct iovec *, int, off_t); +ssize_t pwritev(int, const struct iovec *, int, off_t); +#endif + +#endif /* _COMPAT_FREEBSD_SYS_UIO_H_ */ diff --git a/usr/src/compat/freebsd/termios.h b/usr/src/compat/freebsd/termios.h new file mode 100644 index 0000000000..feaa705358 --- /dev/null +++ b/usr/src/compat/freebsd/termios.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_TERMIOS_H_ +#define _COMPAT_FREEBSD_TERMIOS_H_ + +#include_next <termios.h> + +void cfmakeraw(struct termios *); + +#endif /* _COMPAT_FREEBSD_TERMIOS_H_ */ diff --git a/usr/src/compat/freebsd/uuid.h b/usr/src/compat/freebsd/uuid.h new file mode 100644 index 0000000000..72ef2c7787 --- /dev/null +++ b/usr/src/compat/freebsd/uuid.h @@ -0,0 +1,55 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_UUID_H_ +#define _COMPAT_FREEBSD_UUID_H_ + +#include <sys/endian.h> +#include <uuid/uuid.h> + +/* Status codes returned by the functions. */ +#define uuid_s_ok 0 +#define uuid_s_bad_version 1 +#define uuid_s_invalid_string_uuid 2 + +static __inline void +uuid_from_string(char *str, uuid_t *uuidp, uint32_t *status) +{ + if (uuid_parse(str, *uuidp) == 0) { + *status = uuid_s_ok; + } else { + *status = uuid_s_invalid_string_uuid; + } +} + +static __inline void +uuid_enc_le(void *buf, uuid_t *uuidp) +{ + uchar_t *p; + int i; + + p = buf; + be32enc(p, ((struct uuid *)uuidp)->time_low); + be16enc(p + 4, ((struct uuid *)uuidp)->time_mid); + be16enc(p + 6, ((struct uuid *)uuidp)->time_hi_and_version); + p[8] = ((struct uuid *)uuidp)->clock_seq_hi_and_reserved; + p[9] = ((struct uuid *)uuidp)->clock_seq_low; + + for (i = 0; i < 6; i++) + p[10 + i] = ((struct uuid *)uuidp)->node_addr[i]; + +} + +#endif /* _COMPAT_FREEBSD_UUID_H_ */ diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/vm/pmap.h new file mode 100644 index 0000000000..5958c4b101 --- /dev/null +++ b/usr/src/compat/freebsd/vm/pmap.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_VM_PMAP_H_ +#define _COMPAT_FREEBSD_VM_PMAP_H_ + +#include <machine/pmap.h> + +#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h new file mode 100644 index 0000000000..7da22099b6 --- /dev/null +++ b/usr/src/compat/freebsd/vm/vm.h @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _FREEBSD_VM_VM_H_ +#define _FREEBSD_VM_VM_H_ + +#include <machine/vm.h> + +typedef u_char vm_prot_t; + +#define VM_PROT_NONE ((vm_prot_t) 0x00) +#define VM_PROT_READ ((vm_prot_t) 0x01) +#define VM_PROT_WRITE ((vm_prot_t) 0x02) +#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) + +#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) +#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) + +/* + * <sys/promif.h> contains a troublesome preprocessor define for BYTE. + * Do this ugly workaround to avoid it. + */ +#define _SYS_PROMIF_H +#include <vm/hat_i86.h> +#undef _SYS_PROMIF_H + +#endif /* _FREEBSD_VM_VM_H_ */ diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h new file mode 100644 index 0000000000..a07fc017ad --- /dev/null +++ b/usr/src/compat/freebsd/x86/_types.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _FREEBSD_X86__TYPES_H_ +#define _FREEBSD_X86__TYPES_H_ + +/* + * Basic types upon which most other types are built. + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef short __int16_t; +typedef unsigned short __uint16_t; +typedef int __int32_t; +typedef unsigned int __uint32_t; +#ifdef _LP64 +typedef long __int64_t; +typedef unsigned long __uint64_t; +#else +typedef long long __int64_t; +typedef unsigned long long __uint64_t; +#endif + +/* + * Standard type definitions. + */ +#ifdef _LP64 +typedef __int64_t __register_t; +typedef __uint64_t __vm_offset_t; +typedef __uint64_t __vm_paddr_t; +typedef __int64_t __vm_ooffset_t; +#else +typedef __int32_t __register_t; +typedef __uint32_t __vm_paddr_t; +#endif + +#endif /* _FREEBSD_X86__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h new file mode 100644 index 0000000000..bc6ba976b8 --- /dev/null +++ b/usr/src/compat/freebsd/x86/segments.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_ +#define _COMPAT_FREEBSD_X86_SEGMENTS_H_ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* #GP: General Protection Fault */ +#define IDT_AC 17 /* #AC: Alignment Check */ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h new file mode 100644 index 0000000000..8c79ca1ccc --- /dev/null +++ b/usr/src/head/bhyve.h @@ -0,0 +1,25 @@ +/* + * COPYRIGHT 2013 Pluribus Networks Inc. + * + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + */ +#ifndef _BHYVE_H +#define _BHYVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define BHYVE_TMPDIR "/var/run/bhyve" +#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock" + +#ifdef __cplusplus +} +#endif + +#endif /* _BHYVE_H */ diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile new file mode 100644 index 0000000000..60621fcb75 --- /dev/null +++ b/usr/src/lib/libvmmapi/Makefile @@ -0,0 +1,49 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.lib + +HDRS = vmmapi.h + +HDRDIR = common + +$(BUILD64)SUBDIRS += $(MACH64) + +all:= TARGET= all +install:= TARGET= install +clean:= TARGET= clean +clobber:= TARGET= clobber +lint:= TARGET= lint +_msg:= TARGET= _msg + +.KEEP_STATE: + +all install clean clobber lint: $(SUBDIRS) + +# install rule for install_h target + +install_h: $(ROOTHDRS) + +check: $(CHECKHDRS) + +_msg: $(MSGSUBDIRS) + +$(SUBDIRS): FRC + cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ +include ../../Makefile.msg.targ diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com new file mode 100644 index 0000000000..e41a82f9a2 --- /dev/null +++ b/usr/src/lib/libvmmapi/Makefile.com @@ -0,0 +1,53 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +LIBRARY = libvmmapi.a +VERS = .1 + +OBJECTS = vmmapi.o expand_number.o + +# include library definitions +include ../../Makefile.lib + +# install this library in the root filesystem +include ../../Makefile.rootfs + +SRCDIR = ../common + +LIBS = $(DYNLIB) $(LINTLIB) + +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + $(CPPFLAGS.master) -I$(SRC)/uts/i86pc + +$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) + +LDLIBS += -lc + +.KEEP_STATE: + +all: $(LIBS) + +lint: lintcheck + +pics/%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +pics/%.o: ../common/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# include library targets +include ../../Makefile.targ diff --git a/usr/src/lib/libvmmapi/amd64/Makefile b/usr/src/lib/libvmmapi/amd64/Makefile new file mode 100644 index 0000000000..b5cac1ffce --- /dev/null +++ b/usr/src/lib/libvmmapi/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.lib.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT64) diff --git a/usr/src/lib/libvmmapi/common/llib-lvmmapi b/usr/src/lib/libvmmapi/common/llib-lvmmapi new file mode 100644 index 0000000000..221ed3a23e --- /dev/null +++ b/usr/src/lib/libvmmapi/common/llib-lvmmapi @@ -0,0 +1,2 @@ +/* LINTLIBRARY */ +/* PROTOLIB1 */ diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers new file mode 100644 index 0000000000..7a8443a2b8 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -0,0 +1,77 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +SUNWprivate_1.0 { + global: + vcpu_reset; + vm_activate_cpu; + vm_apicid2vcpu; + vm_capability_name2type; + vm_capability_type2name; + vm_copy_setup; + vm_copy_teardown; + vm_copyin; + vm_copyout; + vm_create; + vm_destroy; + vm_get_capability; + vm_get_desc; + vm_get_highmem_size; + vm_get_lowmem_limit; + vm_get_lowmem_size; + vm_get_memory_seg; + vm_get_register; + vm_get_seg_desc; + vm_get_x2apic_state; + vm_gla2gpa; + vm_inject_exception; + vm_isa_assert_irq; + vm_isa_deassert_irq; + vm_isa_pulse_irq; + vm_isa_set_irq_trigger; + vm_ioapic_assert_irq; + vm_ioapic_deassert_irq; + vm_ioapic_pincount; + vm_ioapic_pulse_irq; + vm_lapic_irq; + vm_lapic_msi; + vm_map_gpa; + vm_open; + vm_parse_memsize; + vm_restart_instruction; + vm_run; + vm_set_capability; + vm_set_desc; + vm_set_register; + vm_set_x2apic_state; + vm_setup_memory; + vm_setup_rom; + local: + *; +}; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c new file mode 100644 index 0000000000..bbab3961a9 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -0,0 +1,1257 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $"); + +#include <sys/param.h> +#include <sys/sysctl.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/_iovec.h> +#include <sys/cpuset.h> + +#include <machine/specialreg.h> + +#ifndef __FreeBSD__ +#include <errno.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> + +#include <libutil.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#ifndef __FreeBSD__ +#include <sys/vmm_impl.h> +#endif + +#include "vmmapi.h" + +#define KB (1024UL) +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +struct vmctx { + int fd; + uint32_t lowmem_limit; + enum vm_mmap_style vms; + char *lowermem_addr; + char *biosmem_addr; + size_t lowmem; + char *lowmem_addr; + size_t highmem; + char *highmem_addr; + uint64_t rombase; + uint64_t romlimit; + char *rom_addr; + char *name; +}; + +#ifdef __FreeBSD__ +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) +#else +#define CREATE(x) vmm_vm_create(x) +#define DESTROY(x) vmm_vm_destroy(x) +#endif + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + +#ifdef __FreeBSD__ + len = strlen("/dev/vmm/") + strlen(name) + 1; +#else + len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1; +#endif + vmfile = malloc(len); + assert(vmfile != NULL); +#ifdef __FreeBSD__ + snprintf(vmfile, len, "/dev/vmm/%s", name); +#else + snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name); +#endif + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +#ifndef __FreeBSD__ +static int +vmm_vm_create(const char *name) +{ + const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; + struct vmm_ioctl vi; + int err = 0; + int ctl_fd; + + (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + + ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); + if (ctl_fd == -1) { + err = errno; + if ((errno == EPERM) || (errno == EACCES)) { + fprintf(stderr, "you do not have permission to " + "perform that operation.\n"); + } else { + fprintf(stderr, "open: %s: %s\n", vmm_ctl, + strerror(errno)); + } + return (err); + } + if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) { + err = errno; + fprintf(stderr, "couldn't create vm \"%s\"", name); + } + close (ctl_fd); + + return (err); +} +#endif + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->lowmem_limit = 3 * GB; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + (void) vm_destroy(vm); + return (NULL); +} + +#ifndef __FreeBSD__ +static int +vmm_vm_destroy(const char *name) +{ + const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; + struct vmm_ioctl vi; + int ctl_fd; + int err = 0; + + (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + + ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); + if (ctl_fd == -1) { + err = errno; + if ((errno == EPERM) || (errno == EACCES)) { + fprintf(stderr, "you do not have permission to " + "perform that operation.\n"); + } else { + fprintf(stderr, "open: %s: %s\n", vmm_ctl, + strerror(errno)); + } + return (err); + } + if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) { + err = errno; + fprintf(stderr, "couldn't destroy vm \"%s\"", name); + } + close (ctl_fd); + return (err); +} +#endif + +int +vm_destroy(struct vmctx *vm) +{ + int err; + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); + err = DESTROY(vm->name); + + free(vm); + return (err); +} + +int +vm_parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(optarg, ret_memsize); + + return (error); +} + +#ifdef __FreeBSD__ +size_t +vmm_get_mem_total(void) +{ + size_t mem_total = 0; + size_t oldlen = sizeof(mem_total); + int error; + error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0); + if (error) + return -1; + return mem_total; +} + +size_t +vmm_get_mem_free(void) +{ + size_t mem_free = 0; + size_t oldlen = sizeof(mem_free); + int error; + error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0); + if (error) + return -1; + return mem_free; +} +#endif + +int +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_len = seg.len; + if (wired != NULL) + *wired = seg.wired; + return (error); +} + +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) +{ + + return (ctx->lowmem_limit); +} + +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +{ + + ctx->lowmem_limit = limit; +} + +static int +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && addr != NULL) { + *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) +{ + char **addr; + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); + ctx->vms = vms; + + /* + * If 'memsize' cannot fit entirely in the 'lowmem' segment then + * create another 'highmem' segment above 4GB for the remainder. + */ + if (memsize > ctx->lowmem_limit) { + ctx->lowmem = ctx->lowmem_limit; + ctx->highmem = memsize - ctx->lowmem; + } else { + ctx->lowmem = memsize; + ctx->highmem = 0; + } + + if (ctx->lowmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL; + error = setup_memory_segment(ctx, 0, 640*KB, addr); + if (error) + return (error); + + addr = (vms == VM_MMAP_ALL) ? &ctx->biosmem_addr : NULL; + error = setup_memory_segment(ctx, 768*KB, 256*KB, addr); + if (error) + return (error); + + addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; + error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr); + if (error) + return (error); + } + + if (ctx->highmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL; + error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + if (error) + return (error); + } + + return (0); +} + +int +vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len) +{ + ctx->rombase = gpa; + ctx->romlimit = gpa + len; + + return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr)); +} + +void * +vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) +{ + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(ctx->vms == VM_MMAP_ALL); + + if (gaddr + len <= 1*MB) { + if (gaddr + len <= 640*KB) + return ((void *)(ctx->lowermem_addr + gaddr)); + + if (768*KB <= gaddr && gaddr + len <= 1*MB) { + gaddr -= 768*KB; + return ((void *)(ctx->biosmem_addr + gaddr)); + } + + return (NULL); + } + + if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) { + gaddr -= 1*MB; + return ((void *)(ctx->lowmem_addr + gaddr)); + } + + if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) { + gaddr -= ctx->rombase; + return ((void *)(ctx->rom_addr + gaddr)); + } + + if (gaddr >= 4*GB) { + gaddr -= 4*GB; + if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) + return ((void *)(ctx->highmem_addr + gaddr)); + } + + return (NULL); +} + +size_t +vm_get_lowmem_size(struct vmctx *ctx) +{ + + return (ctx->lowmem); +} + +size_t +vm_get_highmem_size(struct vmctx *ctx) +{ + + return (ctx->highmem); +} + +int +vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + vmsegdesc.desc.base = base; + vmsegdesc.desc.limit = limit; + vmsegdesc.desc.access = access; + + error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + return (error); +} + +int +vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + if (error == 0) { + *base = vmsegdesc.desc.base; + *limit = vmsegdesc.desc.limit; + *access = vmsegdesc.desc.access; + } + return (error); +} + +int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ + int error; + + error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, + &seg_desc->access); + return (error); +} + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static int +vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, + int error_code, int error_code_valid) +{ + struct vm_exception exc; + + bzero(&exc, sizeof(exc)); + exc.cpuid = vcpu; + exc.vector = vector; + exc.error_code = error_code; + exc.error_code_valid = error_code_valid; + + return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +} + +int +vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vm_exception exc; + + exc.cpuid = vcpu; + exc.vector = vector; + exc.error_code = errcode; + exc.error_code_valid = errcode_valid; + exc.restart_instruction = restart_instruction; + + return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +} + +int +vm_apicid2vcpu(struct vmctx *ctx, int apicid) +{ + /* + * The apic id associated with the 'vcpu' has the same numerical value + * as the 'vcpu' itself. + */ + return (apicid); +} + +int +vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); +} + +int +vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); +} + +int +vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) +{ + struct vm_lapic_msi vmmsi; + + bzero(&vmmsi, sizeof(vmmsi)); + vmmsi.addr = addr; + vmmsi.msg = msg; + + return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); +} + +int +vm_ioapic_assert_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_pincount(struct vmctx *ctx, int *pincount) +{ + + return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); +} + +int +vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); +} + +int +vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger) +{ + struct vm_isa_irq_trigger isa_irq_trigger; + + bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); + isa_irq_trigger.atpic_irq = atpic_irq; + isa_irq_trigger.trigger = trigger; + + return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); +} + +int +vm_inject_nmi(struct vmctx *ctx, int vcpu) +{ + struct vm_nmi vmnmi; + + bzero(&vmnmi, sizeof(vmnmi)); + vmnmi.cpuid = vcpu; + + return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); +} + +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { "enable_invpcid", VM_CAP_ENABLE_INVPCID }, + { 0 } +}; + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +const char * +vm_capability_type2name(int type) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL; i++) { + if (capstrmap[i].type == type) + return (capstrmap[i].name); + } + + return (NULL); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +int +vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + bzero(&pptmmio, sizeof(pptmmio)); + pptmmio.bus = bus; + pptmmio.slot = slot; + pptmmio.func = func; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + bzero(&pptmsi, sizeof(pptmsi)); + pptmsi.vcpu = vcpu; + pptmsi.bus = bus; + pptmsi.slot = slot; + pptmsi.func = func; + pptmsi.msg = msg; + pptmsi.addr = addr; + pptmsi.numvec = numvec; + + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +int +vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct vm_pptdev_msix pptmsix; + + bzero(&pptmsix, sizeof(pptmsix)); + pptmsix.vcpu = vcpu; + pptmsix.bus = bus; + pptmsix.slot = slot; + pptmsix.func = func; + pptmsix.idx = idx; + pptmsix.msg = msg; + pptmsix.addr = addr; + pptmsix.vector_control = vector_control; + + return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); +} + +#ifdef __FreeBSD__ +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} +#endif + +int +vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); + *state = x2apic.state; + return (error); +} + +int +vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + x2apic.state = state; + + error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); + + return (error); +} + +/* + * From Intel Vol 3a: + * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT + */ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + int error; + uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; + uint32_t desc_access, desc_limit; + uint16_t sel; + + zero = 0; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + rip = 0xfff0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + cr0 = CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + goto done; + + cr4 = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + /* + * CS: present, r/w, accessed, 16-bit, byte granularity, usable + */ + desc_base = 0xffff0000; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0xf000; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + goto done; + + /* + * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity + */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + goto done; + + /* General purpose registers */ + rdx = 0xf00; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + goto done; + + /* GDTR, IDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + /* TR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + goto done; + + /* LDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x00000082; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* XXX cr2, debug registers */ + + error = 0; +done: + return (error); +} + +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} + +int +vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) +{ + int error; + struct vm_hpet_cap cap; + + bzero(&cap, sizeof(struct vm_hpet_cap)); + error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); + if (capabilities != NULL) + *capabilities = cap.capabilities; + return (error); +} + +static int +gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, int *fault, uint64_t *gpa) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = ioctl(ctx->fd, VM_GLA2GPA, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} + +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int error, fault; + + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); + if (fault) + error = fault; + return (error); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +int +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) +{ + void *va; + uint64_t gpa; + int error, fault, i, n, off; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); + if (error) + return (-1); + if (fault) + return (1); + + off = gpa & PAGE_MASK; + n = min(len, PAGE_SIZE - off); + + va = vm_map_gpa(ctx, gpa, n); + if (va == NULL) + return (-1); + + iov->iov_base = va; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + return (0); +} + +void +vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) +{ + + return; +} + +void +vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) +{ + const char *src; + char *dst; + size_t n; + + dst = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + src = iov->iov_base; + bcopy(src, dst, n); + + iov++; + dst += n; + len -= n; + } +} + +void +vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, + size_t len) +{ + const char *src; + char *dst; + size_t n; + + src = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + dst = iov->iov_base; + bcopy(src, dst, n); + + iov++; + src += n; + len -= n; + } +} + +int +vm_activate_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpu) +{ + struct vmctx *ctx = arg; + + return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); +} diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h new file mode 100644 index 0000000000..d7eb67aa58 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +#include <sys/param.h> + +struct iovec; +struct vmctx; +enum x2apic_state; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. + */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +int vm_destroy(struct vmctx *ctx); +int vm_parse_memsize(const char *optarg, size_t *memsize); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired); +int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); +int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len); +void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa); +uint32_t vm_get_lowmem_limit(struct vmctx *ctx); +void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +size_t vm_get_lowmem_size(struct vmctx *ctx); +size_t vm_get_highmem_size(struct vmctx *ctx); +int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access); +int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, + struct seg_desc *seg_desc); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_apicid2vcpu(struct vmctx *ctx, int apicid); +int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, + int errcode_valid, uint32_t errcode, int restart_instruction); +int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); +int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); +int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger); +int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_capability_name2type(const char *capname); +const char *vm_capability_type2name(int type); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, + int func, uint64_t addr, uint64_t msg, int numvec); +int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, + int func, int idx, uint64_t addr, uint64_t msg, + uint32_t vector_control); + +/* + * Return a pointer to the statistics buffer. Note that this is not MT-safe. + */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); +int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); + +int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); + +/* + * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. + * The 'iovcnt' should be big enough to accomodate all GPA segments. + * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. + */ +int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); +void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, + void *host_dst, size_t len); +void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, + struct iovec *guest_iov, size_t len); +void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, + int iovcnt); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); + +int vm_activate_cpu(struct vmctx *ctx, int vcpu); + +#ifdef __FreeBSD__ +/* + * FreeBSD specific APIs + */ +int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp); +int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, + uint32_t eip, uint32_t gdtbase, + uint32_t esp); +void vm_setup_freebsd_gdt(uint64_t *gdtr); +#endif +#endif /* _VMMAPI_H_ */ diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl new file mode 100644 index 0000000000..8ca5782feb --- /dev/null +++ b/usr/src/tools/scripts/gensetdefs.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl -w +# +# COPYRIGHT 2013 Pluribus Networks Inc. +# +# All rights reserved. This copyright notice is Copyright Management +# Information under 17 USC 1202 and is included to protect this work and +# deter copyright infringement. Removal or alteration of this Copyright +# Management Information without the express written permission from +# Pluribus Networks Inc is prohibited, and any such unauthorized removal +# or alteration will be a violation of federal law. + +use strict; + +my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`); + +foreach my $Section (@Sections) { + if ($Section =~ "^set_") { + print "\tfixing $Section\n"; + + chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`); + chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`); + my $SectionEnd = hex($SectionAddr) + hex($SectionSize); + + `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`; + } +} diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c new file mode 100644 index 0000000000..40bdd80a6e --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -0,0 +1,1404 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <vm/seg_kmem.h> + +#include <sys/dls.h> +#include <sys/mac_client.h> + +#include <sys/viona_io.h> + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +/* + * Min. octets in an ethernet frame minus FCS + */ +#define MIN_BUF_SIZE 60 + +#define VIONA_NAME "Virtio Network Accelerator" + +#define VIONA_CTL_MINOR 0 +#define VIONA_CTL_NODE_NAME "ctl" + +#define VIONA_CLI_NAME "viona" + +#define VTNET_MAXSEGS 32 + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +#define VRING_USED_F_NO_NOTIFY 1 + +#define BCM_NIC_DRIVER "bnxe" +/* + * Host capabilities + */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ + +#define VIONA_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS) + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; +#pragma pack() + +#pragma pack(1) +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; +#pragma pack() + +#pragma pack(1) +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +typedef struct viona_vring_hqueue { + /* Internal state */ + uint16_t hq_size; + kmutex_t hq_a_mutex; + kmutex_t hq_u_mutex; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + caddr_t hq_baseaddr; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +} viona_vring_hqueue_t; + + +typedef struct viona_link { + datalink_id_t l_linkid; + + struct vm *l_vm; + size_t l_vm_lomemsize; + caddr_t l_vm_lomemaddr; + size_t l_vm_himemsize; + caddr_t l_vm_himemaddr; + + mac_handle_t l_mh; + mac_client_handle_t l_mch; + + kmem_cache_t *l_desb_kmc; + + pollhead_t l_pollhead; + + viona_vring_hqueue_t l_rx_vring; + uint_t l_rx_intr; + + viona_vring_hqueue_t l_tx_vring; + kcondvar_t l_tx_cv; + uint_t l_tx_intr; + kmutex_t l_tx_mutex; + int l_tx_outstanding; + uint32_t l_features; +} viona_link_t; + +typedef struct { + frtn_t d_frtn; + viona_link_t *d_link; + uint_t d_ref; + uint16_t d_cookie; + int d_len; +} viona_desb_t; + +typedef struct viona_soft_state { + viona_link_t *ss_link; +} viona_soft_state_t; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minor_ids; +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait + * for packet transmission to free resources. + */ +static boolean_t copy_tx_mblks = B_TRUE; + +extern struct vm *vm_lookup_by_name(char *name); +extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len); + +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create); +static int viona_ioc_delete(viona_soft_state_t *ss); + +static int viona_vm_map(viona_link_t *link); +static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa); +static void viona_vm_unmap(viona_link_t *link); + +static int viona_ioc_rx_ring_init(viona_link_t *link, + vioc_ring_init_t *u_ri); +static int viona_ioc_tx_ring_init(viona_link_t *link, + vioc_ring_init_t *u_ri); +static int viona_ioc_rx_ring_reset(viona_link_t *link); +static int viona_ioc_tx_ring_reset(viona_link_t *link); +static void viona_ioc_rx_ring_kick(viona_link_t *link); +static void viona_ioc_tx_ring_kick(viona_link_t *link); +static int viona_ioc_rx_intr_clear(viona_link_t *link); +static int viona_ioc_tx_intr_clear(viona_link_t *link); + +static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback); +static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + nodev, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, + sizeof (viona_soft_state_t), 0); + if (ret == 0) { + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + return (ret); + } + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret == 0) { + ddi_soft_state_fini(&viona_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static void +set_viona_tx_mode() +{ + major_t bcm_nic_major; + if ((bcm_nic_major = ddi_name_to_major(BCM_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bcm_nic_major) != NULL) { + copy_tx_mblks = B_FALSE; + ddi_rele_driver(bcm_nic_major); + } + } +} + +static int +viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + viona_minor_ids = id_space_create("viona_minor_id", + VIONA_CTL_MINOR + 1, UINT16_MAX); + + if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME, + S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_dip = dip; + + set_viona_tx_mode(); + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + id_space_destroy(viona_minor_ids); + + ddi_remove_minor_node(viona_dip, NULL); + + viona_dip = NULL; + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + if (drv_priv(credp) != 0) { + return (EPERM); + } + + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc(viona_minor_ids); + if (minor == 0) { + /* All minors are busy */ + return (EBUSY); + } + + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minor_ids, minor); + } + + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + if (drv_priv(credp) != 0) { + return (EPERM); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + viona_ioc_delete(ss); + + ddi_soft_state_free(viona_state, minor); + + id_free(viona_minor_ids, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval) +{ + viona_soft_state_t *ss; + int err = 0; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + err = viona_ioc_create(ss, (vioc_create_t *)data); + break; + case VNA_IOC_DELETE: + err = viona_ioc_delete(ss); + break; + case VNA_IOC_SET_FEATURES: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS; + break; + case VNA_IOC_GET_FEATURES: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + *(int *)data = VIONA_S_HOSTCAPS; + break; + case VNA_IOC_RX_RING_INIT: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_rx_ring_init(ss->ss_link, + (vioc_ring_init_t *)data); + break; + case VNA_IOC_RX_RING_RESET: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_rx_ring_reset(ss->ss_link); + break; + case VNA_IOC_RX_RING_KICK: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + viona_ioc_rx_ring_kick(ss->ss_link); + err = 0; + break; + case VNA_IOC_TX_RING_INIT: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_tx_ring_init(ss->ss_link, + (vioc_ring_init_t *)data); + break; + case VNA_IOC_TX_RING_RESET: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_tx_ring_reset(ss->ss_link); + break; + case VNA_IOC_TX_RING_KICK: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + viona_ioc_tx_ring_kick(ss->ss_link); + err = 0; + break; + case VNA_IOC_RX_INTR_CLR: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_rx_intr_clear(ss->ss_link); + break; + case VNA_IOC_TX_INTR_CLR: + if (ss->ss_link == NULL) { + return (ENOSYS); + } + err = viona_ioc_tx_intr_clear(ss->ss_link); + break; + default: + err = ENOTTY; + break; + } + + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL || ss->ss_link == NULL) { + return (ENXIO); + } + + *reventsp = 0; + + if (ss->ss_link->l_rx_intr && (events & POLLIN)) { + *reventsp |= POLLIN; + } + + if (ss->ss_link->l_tx_intr && (events & POLLOUT)) { + *reventsp |= POLLOUT; + } + + if (*reventsp == 0 && !anyyet) { + *phpp = &ss->ss_link->l_pollhead; + } + + return (0); +} + +static int +viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create) +{ + vioc_create_t k_create; + viona_link_t *link; + char cli_name[MAXNAMELEN]; + int err; + + if (ss->ss_link != NULL) { + return (ENOSYS); + } + if (copyin(u_create, &k_create, sizeof (k_create)) != 0) { + return (EFAULT); + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + + link->l_linkid = k_create.c_linkid; + link->l_vm = vm_lookup_by_name(k_create.c_vmname); + if (link->l_vm == NULL) { + err = ENXIO; + goto bail; + } + + link->l_vm_lomemsize = k_create.c_lomem_size; + link->l_vm_himemsize = k_create.c_himem_size; + err = viona_vm_map(link); + if (err != 0) { + goto bail; + } + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + cmn_err(CE_WARN, "viona create mac_open_by_linkid" + " returned %d\n", err); + goto bail; + } + + snprintf(cli_name, sizeof (cli_name), "%s-%d", + VIONA_CLI_NAME, link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + cmn_err(CE_WARN, "viona create mac_client_open" + " returned %d\n", err); + goto bail; + } + + link->l_features = VIONA_S_HOSTCAPS; + link->l_desb_kmc = kmem_cache_create(cli_name, + sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); + if (copy_tx_mblks) { + mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL); + cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL); + } + ss->ss_link = link; + + return (0); + +bail: + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + + kmem_free(link, sizeof (viona_link_t)); + + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss) +{ + viona_link_t *link; + + link = ss->ss_link; + if (link == NULL) { + return (ENOSYS); + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + while (link->l_tx_outstanding != 0) { + cv_wait(&link->l_tx_cv, &link->l_tx_mutex); + } + mutex_exit(&link->l_tx_mutex); + } + if (link->l_mch != NULL) { + mac_rx_clear(link->l_mch); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + + viona_vm_unmap(link); + mutex_destroy(&link->l_tx_vring.hq_a_mutex); + mutex_destroy(&link->l_tx_vring.hq_u_mutex); + mutex_destroy(&link->l_rx_vring.hq_a_mutex); + mutex_destroy(&link->l_rx_vring.hq_u_mutex); + if (copy_tx_mblks) { + mutex_destroy(&link->l_tx_mutex); + cv_destroy(&link->l_tx_cv); + } + + kmem_cache_destroy(link->l_desb_kmc); + + kmem_free(link, sizeof (viona_link_t)); + + ss->ss_link = NULL; + + return (0); +} + +static caddr_t +viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len) +{ + caddr_t addr; + size_t offset; + pfn_t pfnum; + + if (len == 0) + return (NULL); + + addr = vmem_alloc(heap_arena, len, VM_SLEEP); + if (addr == NULL) + return (NULL); + + for (offset = 0; offset < len; offset += PAGESIZE) { + pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE)); + ASSERT(pfnum); + hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum, + PROT_READ | PROT_WRITE, HAT_LOAD_LOCK); + } + + return (addr); +} + +/* + * Map the guest physical address space into the kernel virtual address space. + */ +static int +viona_vm_map(viona_link_t *link) +{ + link->l_vm_lomemaddr = viona_mapin_vm_chunk(link, + 0, link->l_vm_lomemsize); + if (link->l_vm_lomemaddr == NULL) + return (-1); + link->l_vm_himemaddr = viona_mapin_vm_chunk(link, + 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize); + if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL) + return (-1); + + return (0); +} + +/* + * Translate a guest physical address into a kernel virtual address. + */ +static caddr_t +viona_gpa2kva(viona_link_t *link, uint64_t gpa) +{ + if (gpa < link->l_vm_lomemsize) + return (link->l_vm_lomemaddr + gpa); + + gpa -= (4 * GB); + if (gpa < link->l_vm_himemsize) + return (link->l_vm_himemaddr + gpa); + + return (NULL); +} + +static void +viona_vm_unmap(viona_link_t *link) +{ + if (link->l_vm_lomemaddr) { + hat_unload(kas.a_hat, link->l_vm_lomemaddr, + link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, link->l_vm_lomemaddr, + link->l_vm_lomemsize); + } + if (link->l_vm_himemaddr) { + hat_unload(kas.a_hat, link->l_vm_himemaddr, + link->l_vm_himemsize, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, link->l_vm_himemaddr, + link->l_vm_himemsize); + } +} + +static int +viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq, + vioc_ring_init_t *u_ri) +{ + vioc_ring_init_t k_ri; + + if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) { + return (EFAULT); + } + + hq->hq_size = k_ri.ri_qsize; + hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr); + if (hq->hq_baseaddr == NULL) + return (EINVAL); + + hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link, + k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc))); + if (hq->hq_avail_flags == NULL) + return (EINVAL); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + + hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link, + P2ROUNDUP(k_ri.ri_qaddr + + hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN))); + if (hq->hq_used_flags == NULL) + return (EINVAL); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; + + return (0); +} + +static int +viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) +{ + viona_vring_hqueue_t *hq; + int rval; + + hq = &link->l_rx_vring; + + rval = viona_ioc_ring_init_common(link, hq, u_ri); + if (rval != 0) { + return (rval); + } + + return (0); +} + +static int +viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) +{ + viona_vring_hqueue_t *hq; + + hq = &link->l_tx_vring; + + return (viona_ioc_ring_init_common(link, hq, u_ri)); +} + +static int +viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq) +{ + /* + * Reset all soft state + */ + hq->hq_cur_aidx = 0; + + return (0); +} + +static int +viona_ioc_rx_ring_reset(viona_link_t *link) +{ + viona_vring_hqueue_t *hq; + + mac_rx_clear(link->l_mch); + + hq = &link->l_rx_vring; + + return (viona_ioc_ring_reset_common(hq)); +} + +static int +viona_ioc_tx_ring_reset(viona_link_t *link) +{ + viona_vring_hqueue_t *hq; + + hq = &link->l_tx_vring; + + return (viona_ioc_ring_reset_common(hq)); +} + +static void +viona_ioc_rx_ring_kick(viona_link_t *link) +{ + viona_vring_hqueue_t *hq = &link->l_rx_vring; + + atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); + + mac_rx_set(link->l_mch, viona_rx, link); +} + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static inline int +viona_hq_num_avail(viona_vring_hqueue_t *hq) +{ + uint16_t ndesc; + + /* + * We're just computing (a-b) in GF(216). + * + * The only glitch here is that in standard C, + * uint16_t promotes to (signed) int when int has + * more than 16 bits (pretty much always now), so + * we have to force it back to unsigned. + */ + ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; + + ASSERT(ndesc <= hq->hq_size); + + return (ndesc); +} + +static void +viona_ioc_tx_ring_kick(viona_link_t *link) +{ + viona_vring_hqueue_t *hq = &link->l_tx_vring; + + do { + atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); + while (viona_hq_num_avail(hq)) { + viona_tx(link, hq); + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + if (link->l_tx_outstanding != 0) { + cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex); + } + mutex_exit(&link->l_tx_mutex); + } + atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY); + } while (viona_hq_num_avail(hq)); +} + +static int +viona_ioc_rx_intr_clear(viona_link_t *link) +{ + link->l_rx_intr = 0; + + return (0); +} + +static int +viona_ioc_tx_intr_clear(viona_link_t *link) +{ + link->l_tx_intr = 0; + + return (0); +} +#define VQ_MAX_DESCRIPTORS 512 + +static int +vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, +int n_iov, uint16_t *cookie) +{ + int i; + int ndesc, nindir; + int idx, head, next; + struct virtio_desc *vdir, *vindir, *vp; + + idx = hq->hq_cur_aidx; + ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx); + + if (ndesc == 0) + return (0); + if (ndesc > hq->hq_size) { + cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc); + return (-1); + } + + head = hq->hq_avail_ring[idx & (hq->hq_size - 1)]; + next = head; + + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= hq->hq_size) { + cmn_err(CE_NOTE, "descriptor index (%d)" + "out of range\n", next); + return (-1); + } + + vdir = (struct virtio_desc *)(hq->hq_baseaddr + + next * sizeof (struct virtio_desc)); + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (i > n_iov) + return (-1); + iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr); + if (iov[i].iov_base == NULL) { + cmn_err(CE_NOTE, "invalid guest physical" + " address 0x%"PRIx64"\n", vdir->vd_addr); + return (-1); + } + iov[i++].iov_len = vdir->vd_len; + } else { + nindir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || nindir == 0) { + cmn_err(CE_NOTE, "invalid indir len 0x%x\n", + vdir->vd_len); + return (-1); + } + vindir = (struct virtio_desc *) + viona_gpa2kva(link, vdir->vd_addr); + if (vindir == NULL) { + cmn_err(CE_NOTE, "invalid guest physical" + " address 0x%"PRIx64"\n", vdir->vd_addr); + return (-1); + } + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + cmn_err(CE_NOTE, "indirect desc" + " has INDIR flag\n"); + return (-1); + } + if (i > n_iov) + return (-1); + iov[i].iov_base = + viona_gpa2kva(link, vp->vd_addr); + if (iov[i].iov_base == NULL) { + cmn_err(CE_NOTE, "invalid guest" + " physical address 0x%"PRIx64"\n", + vp->vd_addr); + return (-1); + } + iov[i++].iov_len = vp->vd_len; + + if (i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + + next = vp->vd_next; + if (next >= nindir) { + cmn_err(CE_NOTE, "invalid next" + " %d > %d\n", next, nindir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + hq->hq_cur_aidx++; + return (i); + } + } + +loopy: + cmn_err(CE_NOTE, "%d > descriptor loop count\n", i); + + return (-1); +} + +static void +vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie) +{ + struct virtio_used *vu; + int uidx; + + uidx = *hq->hq_used_idx; + vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *hq->hq_used_idx = uidx; +} + +static void +vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem) +{ + struct virtio_used *vu; + int uidx; + int i; + + uidx = *hq->hq_used_idx; + if (num_bufs == 1) { + vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *hq->hq_used_idx = uidx; +} + +/* + * Copy bytes from mp to iov. + * copied_buf: Total num_bytes copied from mblk to iov array. + * buf: pointer to iov_base. + * i: index of iov array. Mainly used to identify if we are + * dealing with first iov array element. + * rxhdr_size: Virtio header size. Two possibilities in case + * of MRGRX buf, header has 2 additional bytes. + * In case of mrgrx, virtio header should be part of iov[0]. + * In case of non-mrgrx, virtio header may or may not be part + * of iov[0]. + */ +static int +copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov, + int i, int rxhdr_size) +{ + int copied_chunk = 0; + mblk_t *ml; + int total_buf_len = iov->iov_len; + /* + * iov[0] might have header, adjust + * total_buf_len accordingly + */ + if (i == 0) { + total_buf_len = iov->iov_len - rxhdr_size; + } + for (ml = mp; ml != NULL; ml = ml->b_cont) { + size_t chunk = MBLKL(ml); + /* + * If chunk is less than + * copied_buf we should move + * to correct msgblk + */ + if (copied_buf != 0) { + if (copied_buf < chunk) { + chunk -= copied_buf; + } else { + copied_buf -= chunk; + continue; + } + } + /* + * iov[0] already has virtio header. + * and if copied chunk is length of iov_len break + */ + if (copied_chunk == total_buf_len) { + break; + } + /* + * Sometimes chunk is total mblk len, sometimes mblk is + * divided into multiple chunks. + */ + if (chunk > copied_buf) { + if (chunk > copied_chunk) { + if ((chunk + copied_chunk) > total_buf_len) + chunk = (size_t)total_buf_len + - copied_chunk; + } else { + if (chunk > (total_buf_len - copied_chunk)) + chunk = (size_t)((total_buf_len + - copied_chunk) - chunk); + } + bcopy(ml->b_rptr + copied_buf, buf, chunk); + } else { + if (chunk > (total_buf_len - copied_chunk)) { + chunk = (size_t)(total_buf_len - copied_chunk); + } + bcopy(ml->b_rptr + copied_buf, buf, chunk); + } + buf += chunk; + copied_chunk += chunk; + } + return (copied_chunk); +} + +static void +viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + viona_link_t *link = arg; + viona_vring_hqueue_t *hq = &link->l_rx_vring; + mblk_t *mp0 = mp; + + while (viona_hq_num_avail(hq)) { + struct iovec iov[VTNET_MAXSEGS]; + size_t mblklen; + int n, i = 0; + uint16_t cookie; + struct virtio_net_hdr *vrx; + struct virtio_net_mrgrxhdr *vmrgrx; + mblk_t *ml; + caddr_t buf; + int total_len = 0; + int copied_buf = 0; + int num_bufs = 0; + int num_pops = 0; + used_elem_t uelem[VTNET_MAXSEGS]; + + if (mp == NULL) { + break; + } + mblklen = msgsize(mp); + if (mblklen == 0) { + break; + } + + mutex_enter(&hq->hq_a_mutex); + n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); + mutex_exit(&hq->hq_a_mutex); + if (n <= 0) { + break; + } + num_pops++; + if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { + int total_n = n; + int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr); + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + if (n == 1) { + buf = iov[0].iov_base + mrgrxhdr_size; + } + while (mblklen > copied_buf) { + if (total_n == i) { + mutex_enter(&hq->hq_a_mutex); + n = vq_popchain(link, hq, &iov[i], + VTNET_MAXSEGS, &cookie); + mutex_exit(&hq->hq_a_mutex); + if (n <= 0) { + freemsgchain(mp0); + return; + } + num_pops++; + total_n += n; + } + if (total_n > i) { + int copied_chunk = 0; + if (i != 0) { + buf = iov[i].iov_base; + } + copied_chunk = copy_in_mblk(mp, + copied_buf, buf, &iov[i], i, + mrgrxhdr_size); + copied_buf += copied_chunk; + uelem[i].id = cookie; + uelem[i].len = copied_chunk; + if (i == 0) { + uelem[i].len += mrgrxhdr_size; + } + } + num_bufs++; + i++; + } + } else { + boolean_t virt_hdr_incl_iov = B_FALSE; + int rxhdr_size = sizeof (struct virtio_net_hdr); + /* First element is header */ + vrx = (struct virtio_net_hdr *)iov[0].iov_base; + if (n == 1 || iov[0].iov_len > rxhdr_size) { + buf = iov[0].iov_base + rxhdr_size; + virt_hdr_incl_iov = B_TRUE; + total_len += rxhdr_size; + if (iov[0].iov_len < rxhdr_size) { + // Buff too small to fit pkt. Drop it. + freemsgchain(mp0); + return; + } + } else { + total_len = iov[0].iov_len; + } + if (iov[0].iov_len == rxhdr_size) + i++; + while (mblklen > copied_buf) { + if (n > i) { + int copied_chunk = 0; + if (i != 0) { + buf = iov[i].iov_base; + } + /* + * In case of non-mrgrx buf, first + * descriptor always has header and + * rest of the descriptors have data. + * But it is not guaranteed that first + * descriptor will only have virtio + * header. It might also have data. + */ + if (virt_hdr_incl_iov) { + copied_chunk = copy_in_mblk(mp, + copied_buf, buf, &iov[i], + i, rxhdr_size); + } else { + copied_chunk = copy_in_mblk(mp, + copied_buf, buf, &iov[i], + i, 0); + } + copied_buf += copied_chunk; + total_len += copied_chunk; + } else { + /* + * Drop packet as it cant fit + * in buf provided by guest. + */ + freemsgchain(mp0); + return; + } + i++; + } + } + /* + * The only valid field in the rx packet header is the + * number of buffers, which is always 1 without TSO + * support. + */ + if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { + memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr)); + vmrgrx->vrh_bufs = num_bufs; + /* + * Make sure iov[0].iov_len >= MIN_BUF_SIZE + * otherwise guest will consider it as invalid frame. + */ + if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) { + uelem[0].len = MIN_BUF_SIZE; + } + /* + * Release this chain and handle more chains. + */ + mutex_enter(&hq->hq_u_mutex); + vq_pushchain_mrgrx(hq, num_pops, uelem); + mutex_exit(&hq->hq_u_mutex); + } else { + memset(vrx, 0, sizeof (struct virtio_net_hdr)); + if (total_len < MIN_BUF_SIZE) { + total_len = MIN_BUF_SIZE; + } + /* + * Release this chain and handle more chains. + */ + mutex_enter(&hq->hq_u_mutex); + vq_pushchain(hq, total_len, cookie); + mutex_exit(&hq->hq_u_mutex); + } + + mp = mp->b_next; + } + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) { + pollwakeup(&link->l_pollhead, POLLIN); + } + } + + freemsgchain(mp0); +} + +static void +viona_desb_free(viona_desb_t *dp) +{ + viona_link_t *link; + viona_vring_hqueue_t *hq; + struct virtio_used *vu; + int uidx; + uint_t ref; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref != 0) + return; + + link = dp->d_link; + hq = &link->l_tx_vring; + + mutex_enter(&hq->hq_u_mutex); + vq_pushchain(hq, dp->d_len, dp->d_cookie); + mutex_exit(&hq->hq_u_mutex); + + kmem_cache_free(link->l_desb_kmc, dp); + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) { + pollwakeup(&link->l_pollhead, POLLOUT); + } + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + if (--link->l_tx_outstanding == 0) { + cv_broadcast(&link->l_tx_cv); + } + mutex_exit(&link->l_tx_mutex); + } +} + +static void +viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int i, n; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp; + mac_client_handle_t link_mch = link->l_mch; + + mp_head = mp_tail = NULL; + + mutex_enter(&hq->hq_a_mutex); + n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); + mutex_exit(&hq->hq_a_mutex); + ASSERT(n != 0); + + dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP); + dp->d_frtn.free_func = viona_desb_free; + dp->d_frtn.free_arg = (void *)dp; + dp->d_link = link; + dp->d_cookie = cookie; + + dp->d_ref = 0; + dp->d_len = iov[0].iov_len; + + for (i = 1; i < n; i++) { + dp->d_ref++; + dp->d_len += iov[i].iov_len; + if (copy_tx_mblks) { + mp = desballoc((uchar_t *)iov[i].iov_base, + iov[i].iov_len, BPRI_MED, &dp->d_frtn); + ASSERT(mp); + } else { + mp = allocb(iov[i].iov_len, BPRI_MED); + ASSERT(mp); + bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr, + iov[i].iov_len); + } + mp->b_wptr += iov[i].iov_len; + if (mp_head == NULL) { + ASSERT(mp_tail == NULL); + mp_head = mp; + } else { + ASSERT(mp_tail != NULL); + mp_tail->b_cont = mp; + } + mp_tail = mp; + } + if (copy_tx_mblks == B_FALSE) { + viona_desb_free(dp); + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + link->l_tx_outstanding++; + mutex_exit(&link->l_tx_mutex); + } + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); +} diff --git a/usr/src/uts/i86pc/io/viona/viona.conf b/usr/src/uts/i86pc/io/viona/viona.conf new file mode 100644 index 0000000000..e66488531a --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2013 Pluribus Networks Inc. +# + +name="viona" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c new file mode 100644 index 0000000000..6b62daae6c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/smp.h> + +#include <machine/vmm.h> +#ifdef __FreeBSD__ +#include "io/iommu.h" +#endif + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap_set: not implemented\n"); + return (EINVAL); +} + +static vm_paddr_t +amdv_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + + printf("amdv_vmmmap_get: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap_set, + amdv_vmmmap_get, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +#ifdef __FreeBSD__ +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c new file mode 100644 index 0000000000..5ae9ed2f6a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -0,0 +1,452 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/param.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +/* + * Set this to 1 to have the EPT tables respect the guest PAT settings + */ +static int ept_pat_passthru; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +#if 0 +static void +ept_dump(uint64_t *ptp, int nlevels) +{ + int i, t, tabs; + uint64_t *ptpnext, ptpval; + + if (--nlevels < 0) + return; + + tabs = 3 - nlevels; + for (t = 0; t < tabs; t++) + printf("\t"); + printf("PTP = %p\n", ptp); + + for (i = 0; i < 512; i++) { + ptpval = ptp[i]; + + if (ptpval == 0) + continue; + + for (t = 0; t < tabs; t++) + printf("\t"); + printf("%3d 0x%016lx\n", i, ptpval); + + if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { + ptpnext = (uint64_t *) + PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + ept_dump(ptpnext, nlevels); + } + } +} +#endif + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { +#ifdef __FreeBSD__ + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); +#else + void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP); + ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0); +#endif + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ +#ifdef __FreeBSD__ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); +#else + ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK)); +#endif + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + if (prot != VM_PROT_NONE) { + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * By default the PAT type is ignored - this appears to + * be how other hypervisors handle EPT. Allow this to be + * overridden. + */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + if (!ept_pat_passthru) + ptp[ptpindex] |= EPT_PG_IGNORE_PAT; + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + } else { + /* Remove the mapping */ + ptp[ptpindex] = 0; + } + + return (1UL << ptpshift); +} + +static vm_paddr_t +ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) +{ + int nlevels, ptpshift, ptpindex; + uint64_t ptpval, hpabase, pgmask; + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + ptpval = ptp[ptpindex]; + + /* Cannot make progress beyond this point */ + if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) + break; + + if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { + pgmask = (1UL << ptpshift) - 1; + hpabase = ptpval & ~pgmask; + return (hpabase | (gpa & pgmask)); + } + + /* Work our way down to the next level page table page */ +#ifdef __FreBSD__ + ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); +#else + ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK)); +#endif + } + + return ((vm_paddr_t)-1); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { +#ifdef __FreeBSD__ + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ +#else + page_t *pp; + pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK)); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK)); + kmem_free((void *)pp->p_offset, PAGE_SIZE); +#endif + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { +#ifdef __FreeBSD__ + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ +#else + page_t *pp; + pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK)); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK)); + kmem_free((void *)pp->p_offset, PAGE_SIZE); +#endif + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { +#ifdef __FreeBSD__ + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ +#else + page_t *pp; + pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e + & EPT_ADDR_MASK)); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK)); + kmem_free((void *)pp->p_offset, PAGE_SIZE); +#endif + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +vm_paddr_t +ept_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + vm_paddr_t hpa; + struct vmx *vmx; + + vmx = arg; + hpa = ept_lookup_mapping(vmx->pml4ept, gpa); + return (hpa); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h new file mode 100644 index 0000000000..d0bcce7ec3 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c new file mode 100644 index 0000000000..bbd2da2a34 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -0,0 +1,597 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifdef __FreeBSD__ +#include "opt_ddb.h" +#endif + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/vmm.h> +#include "vmm_host.h" +#include "vmx_cpufunc.h" +#include "vmcs.h" +#include "ept.h" +#include "vmx.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. + */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + if (!running) + VMPTRLD(vmcs); + + error = vmread(encoding, retval); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(encoding, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifndef __FreeBSD__ +int +vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Host MSRs are loaded from the VM-exit MSR-load area. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} +#endif + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat, fsbase, idtrbase; + uint32_t exc_bitmap; + + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); + + /* + * Make sure we have a "current" VMCS to work with. + */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = vmm_get_host_pat(); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = vmm_get_host_efer(); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + + cr0 = vmm_get_host_cr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = vmm_get_host_cr4() | CR4_VMXE; + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + +#ifdef __FreeBSD__ + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; +#else + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel())) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel())) != 0) + goto done; +#endif + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + +#ifdef __FreeBSD__ + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) + goto done; +#endif + + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTR_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTR_ERRCODE)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h new file mode 100644 index 0000000000..20e99e8184 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -0,0 +1,410 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_init(struct vmcs *vmcs); +#ifndef __FreeBSD__ +int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count); +#endif +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); +int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); + +/* + * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h + */ +#ifdef _VMX_CPUFUNC_H_ +static __inline uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); + return (val); +} + +static __inline void +vmcs_write(uint32_t encoding, uint64_t val) +{ + int error; + + error = vmwrite(encoding, val); + KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); +} +#endif /* _VMX_CPUFUNC_H_ */ + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. + */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c new file mode 100644 index 0000000000..7ddf4e2a46 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -0,0 +1,2842 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT) + +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) + +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); + +int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#ifndef __FreeBSD__ +static vm_paddr_t vmxon_region_pa[MAXCPU]; +#endif + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, + &cr0_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, + &cr0_zeros_mask, 0, NULL); + +static uint64_t cr4_ones_mask, cr4_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, + &cr4_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, + &cr4_zeros_mask, 0, NULL); + +static int vmx_initialized; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, + &vmx_initialized, 0, "Intel VMX initialized"); + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; +static int cap_invpcid; + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, + &posted_interrupts, 0, "APICv posted interrupt support"); + +static int pirvec; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, + &pirvec, 0, "APICv posted interrupt vector"); + +static struct unrhdr *vpid_unr; +static u_int vpid_alloc_failed; +SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, + &vpid_alloc_failed, 0, NULL); + +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. + */ +#define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); +static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); +static void vmx_inject_pir(struct vlapic *vlapic); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + case VMX_RETURN_AST: + return "ast"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +static int +vmx_allow_x2apic_msrs(struct vmx *vmx) +{ + int i, error; + + error = 0; + + /* + * Allow readonly access to the following x2APIC MSRs from the guest. + */ + error += guest_msr_ro(vmx, MSR_APIC_ID); + error += guest_msr_ro(vmx, MSR_APIC_VERSION); + error += guest_msr_ro(vmx, MSR_APIC_LDR); + error += guest_msr_ro(vmx, MSR_APIC_SVR); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + + error += guest_msr_ro(vmx, MSR_APIC_ESR); + error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); + error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); + error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); + error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. + * + * These registers get special treatment described in the section + * "Virtualizing MSR-Based APIC Accesses". + */ + error += guest_msr_rw(vmx, MSR_APIC_TPR); + error += guest_msr_rw(vmx, MSR_APIC_EOI); + error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); + + return (error); +} + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +vpid_free(int vpid) +{ + if (vpid < 0 || vpid > 0xffff) + panic("vpid_free: invalid vpid %d", vpid); + + /* + * VPIDs [0,VM_MAXCPU] are special and are not allocated from + * the unit number allocator. + */ + + if (vpid > VM_MAXCPU) + free_unr(vpid_unr, vpid); +} + +static void +vpid_alloc(uint16_t *vpid, int num) +{ + int i, x; + + if (num <= 0 || num > VM_MAXCPU) + panic("invalid number of vpids requested: %d", num); + + /* + * If the "enable vpid" execution control is not enabled then the + * VPID is required to be 0 for all vcpus. + */ + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { + for (i = 0; i < num; i++) + vpid[i] = 0; + return; + } + + /* + * Allocate a unique VPID for each vcpu from the unit number allocator. + */ + for (i = 0; i < num; i++) { + x = alloc_unr(vpid_unr); + if (x == -1) + break; + else + vpid[i] = x; + } + + if (i < num) { + atomic_add_int(&vpid_alloc_failed, 1); + + /* + * If the unit number allocator does not have enough unique + * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. + * + * These VPIDs are not be unique across VMs but this does not + * affect correctness because the combined mappings are also + * tagged with the EP4TA which is unique for each VM. + * + * It is still sub-optimal because the invvpid will invalidate + * combined mappings for a particular VPID across all EP4TAs. + */ + while (i-- > 0) + vpid_free(vpid[i]); + + for (i = 0; i < num; i++) + vpid[i] = i + 1; + } +} + +static void +vpid_init(void) +{ + /* + * VPID 0 is required when the "enable VPID" execution control is + * disabled. + * + * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the + * unit number allocator does not have sufficient unique VPIDs to + * satisfy the allocation. + * + * The remaining VPIDs are managed by the unit number allocator. + */ + vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); +} + +#ifndef __FreeBSD__ +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + { MSR_LSTAR, 0, 0 }, + { MSR_CSTAR, 0, 0 }, + { MSR_STAR, 0, 0 }, + { MSR_SF_MASK, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +host_msr_save_area_init(struct msr_entry *h_area, int *h_count) +{ + int i, cnt; + + static struct msr_entry host_msrs[] = { + { MSR_LSTAR, 0, 0 }, + { MSR_CSTAR, 0, 0 }, + { MSR_STAR, 0, 0 }, + { MSR_SF_MASK, 0, 0 }, + }; + + cnt = sizeof(host_msrs) / sizeof(host_msrs[0]); + if (cnt > HOST_MSR_MAX_ENTRIES) + panic("host msr save area overrun"); + for (i = 0; i < cnt; i++) { + host_msrs[i].val = rdmsr(host_msrs[i].index); + } + bcopy(host_msrs, h_area, sizeof(host_msrs)); + *h_count = cnt; +} +#endif + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. + */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + +#ifdef __FreeBSD__ + if (pirvec != 0) + vmm_ipi_free(pirvec); +#endif + + if (vpid_unr != NULL) { + delete_unrhdr(vpid_unr); + vpid_unr = NULL; + } + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + uint64_t feature_control; + + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + wrmsr(MSR_IA32_FEATURE_CONTROL, + feature_control | IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK); + } + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); +#ifdef __FreeBSD__ + error = vmxon(vmxon_region[curcpu]); +#else + error = vmxon_pa(vmxon_region_pa[curcpu]); + ASSERT(error == 0); +#endif + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ +#define X86FSET_VMX 35 + extern uchar_t x86_featureset[]; + extern boolean_t is_x86_feature(void *featureset, uint_t feature); + int error; + uint64_t fixed0, fixed1, feature_control; + uint32_t tmp; +#ifndef __FreeBSD__ + int i; +#endif + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ +#ifdef __FreeBSD__ + if (!(cpu_feature2 & CPUID2_VMX)) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { + cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n"); + } +#endif + + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). + */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + +#ifndef __FreeBSD__ + for (i = 0; i < MAXCPU; i++) { + vmxon_region_pa[i] = vtophys(&vmxon_region[i]); + } +#endif + + vpid_init(); + + vmx_msr_init(); + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + vmx_initialized = 1; + + return (0); +} + +static int +vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) +{ + int error, mask_ident, shadow_ident; + uint64_t mask_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + } + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); + if (error) + return (error); + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); + if (error) + return (error); + + return (0); +} +#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) +#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid[VM_MAXCPU]; + int i, error, guest_msr_count; +#ifndef __FreeBSD__ + int host_msr_count; +#endif + struct vmx *vmx; + struct vmcs *vmcs; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + * + * The TSC MSR is exposed read-only. Writes are disallowed as + * that will impact the host TSC. If the guest does a write + * the "use TSC offsetting" execution control is enabled and the + * difference between the host TSC and the guest TSC is written + * into the TSC offset in the VMCS. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT) || + guest_msr_ro(vmx, MSR_TSC)) + panic("vmx_vminit: error setting guest msr access"); + + vpid_alloc(vpid, VM_MAXCPU); + + for (i = 0; i < VM_MAXCPU; i++) { + vmcs = &vmx->vmcs[i]; + vmcs->identifier = vmx_revision(); + error = vmclear(vmcs); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vmx_msr_guest_init(vmx, i); + + error = vmcs_set_defaults(vmcs, + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid[i]); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid[i]; + +#ifndef __FreeBSD__ + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count); + + error = vmcs_set_host_msr_save(&vmx->vmcs[i], + vtophys(vmx->host_msrs[i]), + host_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); +#endif + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(vmcs, 0); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ +#ifdef KTR + VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); +#endif +} + +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); +#endif +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; +#ifndef __FreeBSD__ + desctbr_t idtr, gdtr; +#endif + + vmxstate = &vmx->state[vcpu]; + vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase()); + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + +#ifndef __FreeBSD__ + vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR)); + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); + vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR)); +#endif + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + +#ifdef __FreeBSD__ + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); +#else + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); +#endif + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + } +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + +#ifdef __FreeBSD__ + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); +#else + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); +#endif + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); +} + +int +vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) +{ + int error; + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); + } + + error = vmwrite(VMCS_TSC_OFFSET, offset); + + return (error); +} + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + uint32_t gi, info; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); +#ifdef __FreeBSD__ + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); +#else + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %x", gi)); +#endif + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); +#else + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %x", info)); +#endif + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { +#ifdef __FreeBSD__ + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); +#else + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %lx", __func__, entryinfo)); +#endif + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); +#else + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %lx/%x", __func__, entryinfo, info)); +#endif + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vmx, vcpu); + } + + extint_pending = vm_extint_pending(vmx->vm, vcpu); + +#ifdef __FreeBSD__ + if (!extint_pending && virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } +#endif + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. + */ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. + * - An NMI was injected above or after "NMI window exiting" + */ + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (!extint_pending) { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. + */ +static void +vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static uint64_t +vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: + vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. + */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. */ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vmx, vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + + if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +static uint64_t +inout_str_index(struct vmx *vmx, int vcpuid, int in) +{ + uint64_t val; + int error; + enum vm_reg_name reg; + + reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vmx, vcpuid, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx *vmx, int vcpuid, int rep) +{ + uint64_t val; + int error; + + if (rep) { + error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +{ + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static boolean_t +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (FALSE); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (FALSE); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (FALSE); + } + + return (TRUE); +} + +static int +emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); + else + error = vmx_wrmsr(vmx, vcpuid, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) +{ + struct vmxctx *vmxctx; + uint64_t result; + uint32_t eax, edx; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); + else + error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); + + if (error == 0) { + eax = result; + vmxctx = &vmx->ctx[vcpuid]; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); + KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); + + edx = result >> 32; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); + KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); + } + + return (error); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int error, handled, in; + struct vmcs *vmcs; + struct vmxctx *vmxctx; + struct vm_inout_str *vis; + uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info; + uint64_t qual, gla, gpa, cr3; + bool retu; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + + switch (vmexit->u.vmx.exit_reason) { + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vmx, vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vmx, vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); + retu = false; + ecx = vmxctx->guest_rcx; + VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + error = emulate_rdmsr(vmx, vcpu, ecx, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_WRMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); + retu = false; + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", + ecx, (uint64_t)edx << 32 | eax); + error = emulate_wrmsr(vmx, vcpu, ecx, + (uint64_t)edx << 32 | eax, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + vmexit->exitcode = VM_EXITCODE_MTRAP; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + vmx_clear_int_window_exiting(vmx, vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? + */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); +#ifdef __FreeBSD__ + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %#x", intr_info)); +#else + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %x", intr_info)); +#endif +#if 0 /* XXX */ + vmx_trigger_hostintr(intr_info & 0xff); +#endif + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vmx->vm, vcpu)) + vmx_inject_nmi(vmx, vcpu); + vmx_clear_nmi_window_exiting(vmx, vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + if (vmexit->u.inout.string) { + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging); + vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(VMCS_GUEST_CR0); + vis->index = inout_str_index(vmx, vcpu, in); + vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + inout_str_seginfo(vmx, vcpu, inst_info, in, vis); + } + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); +#else + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %x", intr_info)); +#endif + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_info & 0xff) != IDT_DF && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) + return (1); + break; + case EXIT_REASON_EPT_FAULT: + gpa = vmcs_gpa(); + if (ept_emulation_fault(qual)) { + vmexit_inst_emul(vmexit, gpa, vmcs_gla()); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + } + break; + default: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + + /* + * Special case for spinning up an AP - exit to userspace to + * give the controlling process a chance to intercept and + * spin up a thread for the AP. + */ + if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) + handled = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip) +{ + int error, vie, rc, handled, astpending; + uint32_t exit_reason; + struct vmx *vmx; + struct vm *vm; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + struct vm_exit *vmexit; + struct vlapic *vlapic; + + vmx = arg; + vm = vmx->vm; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + vlapic = vm_lapic(vm, vcpu); + vmxctx->launched = 0; + + astpending = 0; + vmexit = vm_exitinfo(vmx->vm, vcpu); + + vmx_msr_guest_enter(vmx, vcpu); + + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + vmcs_write(VMCS_HOST_CR3, rcr3()); + + vmcs_write(VMCS_GUEST_RIP, rip); + vmx_set_pcpu_defaults(vmx, vcpu); + do { + vmx_inject_interrupts(vmx, vcpu, vlapic); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (vmxctx->launched == 0) { + vmxctx->launched = 1; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_AST: + astpending = 1; + break; + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = rip; + + /* enable interrupts */ + enable_intr(); + + if (astpending) { + handled = 1; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmx_astpending_trace(vmx, vcpu, rip); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); + break; + } + + handled = vmx_exit_process(vmx, vcpu, vmexit); + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); + + } while (handled); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + if (!handled) + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + + VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", + vmexit->exitcode); + + VMCLEAR(vmcs); + vmx_msr_guest_exit(vmx, vcpu); + + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.status = ~0; + VMCLEAR(vmcs); + vmx_msr_guest_exit(vmx, vcpu); + + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int i, error; + struct vmx *vmx = arg; + + for (i = 0; i < VM_MAXCPU; i++) + vpid_free(vmx->state[i].vpid); + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_shadow_reg(int reg) +{ + int shreg; + + shreg = -1; + + switch (reg) { + case VM_REG_GUEST_CR0: + shreg = VMCS_CR0_SHADOW; + break; + case VM_REG_GUEST_CR4: + shreg = VMCS_CR4_SHADOW; + break; + default: + break; + } + + return (shreg); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + int running, hostcpu; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error, hostcpu, running, shadow; + uint64_t ctls; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + + shadow = vmx_shadow_reg(reg); + if (shadow > 0) { + /* + * Store the unmodified value in the shadow + */ + error = vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(shadow), val); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; +}; + +#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +do { \ + VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, + level, "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint32_t ppr, vpr; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. + */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) + return (0); /* common case */ + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. + */ + lapic = vlapic->apic_page; + ppr = lapic->ppr & 0xf0; + if (ppr == 0) + return (1); + + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", + lapic->ppr); + + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; + return (vpr > ppr); + } + } + return (0); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx *vmx; + struct vmcs *vmcs; + uint64_t mask, val; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), + ("vmx_set_tmr: vcpu cannot be running")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vmx = vlapic_vtx->vmx; + vmcs = &vmx->vmcs[vlapic->vcpuid]; + mask = 1UL << (vector % 64); + + VMPTRLD(vmcs); + val = vmcs_read(VMCS_EOI_EXIT(vector)); + if (level) + val |= mask; + else + val &= ~mask; + vmcs_write(VMCS_EOI_EXIT(vector), val); + VMCLEAR(vmcs); +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ + + ipi_cpu(hostcpu, pirvec); +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. + */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase = -1; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + pirbase = -1; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. + * + * It is possible for pirval to be 0 here, even though the + * pending bit has been set. The scenario is: + * CPU-Y is sending a posted interrupt to CPU-X, which + * is running a guest and processing posted interrupts in h/w. + * CPU-X will eventually exit and the state seen in s/w is + * the pending bit set, but no PIR bits set. + * + * CPU-X CPU-Y + * (vm running) (host running) + * rx posted interrupt + * CLEAR pending bit + * SET PIR bit + * READ/CLEAR PIR bits + * SET pending bit + * (vm exit) + * pending bit set, PIR 0 + */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; +#ifdef __FreeBSD__ + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; +#endif + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap_set, + ept_vmmmap_get, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_getcap, + vmx_setcap, + vmx_vlapic_init, + vmx_vlapic_cleanup, +}; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h new file mode 100644 index 0000000000..50ca62b371 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -0,0 +1,156 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#ifndef __FreeBSD__ +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ +#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */ +#endif + +struct vmxctx { + register_t tmpstk[32]; /* vmx_return() stack */ + register_t tmpstktop; + + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launched; /* vmcs launch state */ + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + char msr_bitmap[PAGE_SIZE]; + struct pir_desc pir_desc[VM_MAXCPU]; +#ifdef __FreeBSD__ + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; +#else + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES]; +#endif + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +#define VMX_RETURN_AST 4 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + * - 4 when it returns from vmx_resume or vmx_launch because of AST pending + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h new file mode 100644 index 0000000000..08b1469f19 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000000..9513f6c70b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,245 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon_pa(vm_paddr_t addr) +{ + int error; + + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +vmxoff(void) +{ + + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [r] "r" (r), [addr] "m" (*addr) + : "memory"); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t _res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invept error %d", error); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c new file mode 100644 index 0000000000..1ced311ca8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuset.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#ifndef __FreeBSD__ +#include <vm/pmap.h> +#endif + +#include "vmx.h" +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; +static uint64_t host_msrs[GUEST_MSR_NUM]; + +static bool +nehalem_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + +#ifdef __FreeBSD__ + /* + * It is safe to cache the values of the following MSRs because + * they don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. + */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + /* + * The permissions bitmap is shared between all vcpus so initialize it + * once when initializing the vBSP. + */ + if (vcpuid == 0) { + guest_msr_rw(vmx, MSR_LSTAR); + guest_msr_rw(vmx, MSR_CSTAR); + guest_msr_rw(vmx, MSR_STAR); + guest_msr_rw(vmx, MSR_SF_MASK); + guest_msr_rw(vmx, MSR_KGSBASE); + } + return; +} + +void +vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) +{ +#ifdef __FreeBSD__ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save host MSRs (if any) and restore guest MSRs */ + wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); +#endif +} + +void +vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) +{ +#ifdef __FreeBSD__ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save guest MSRs */ + guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +#endif +} + +int +vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + uint64_t changed; + int error; + + error = 0; + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) + error = EINVAL; + + break; + case MSR_TSC: + error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h new file mode 100644 index 0000000000..5300d14d9b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid); +int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu); +int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu); + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s new file mode 100644 index 0000000000..d57dde1093 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <machine/asmacros.h> + +#include "vmx_assym.s" + +/* + * Disable interrupts before updating %rsp in VMX_CHECK_AST or + * VMX_GUEST_RESTORE. + * + * The location that %rsp points to is a 'vmxctx' and not a + * real stack so we don't want an interrupt handler to trash it + */ +#define VMX_DISABLE_INTERRUPTS cli + +/* + * If the thread hosting the vcpu has an ast pending then take care of it + * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. + * + * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts + * are disabled. + */ +#ifdef __FreeBSD__ +#define VMX_CHECK_AST \ + movq PCPU(CURTHREAD),%rax; \ + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: +#else +#define VMX_CHECK_AST \ + movq %gs:CPU_THREAD,%rax; \ + movl T_ASTFLAG(%rax),%eax; \ + test %al,%al; \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: +#endif + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. + */ +#define VMX_GUEST_RESTORE \ + movq %rdi,%rsp; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. + */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_launch) diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c new file mode 100644 index 0000000000..a93b252c91 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -0,0 +1,809 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <x86/apicreg.h> +#include <dev/ic/i8259.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) +#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) +#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +struct atpic { + bool ready; + int icw_num; + int rd_cmd_reg; + + bool aeoi; + bool poll; + bool rotate; + bool sfn; /* special fully-nested mode */ + + int irq_base; + uint8_t request; /* Interrupt Request Register (IIR) */ + uint8_t service; /* Interrupt Service (ISR) */ + uint8_t mask; /* Interrupt Mask Register (IMR) */ + uint8_t smm; /* special mask mode */ + + int acnt[8]; /* sum of pin asserts and deasserts */ + int lowprio; /* lowest priority irq */ + + bool intr_raised; +}; + +struct vatpic { + struct vm *vm; + struct mtx mtx; + struct atpic atpic[2]; + uint8_t elc[2]; +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest. + */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. + */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + if ((val & ICW4_AEOI) != 0) + atpic->aeoi = true; + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? "enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + KASSERT(VATPIC_LOCKED(vatpic), + ("vatpic_set_pinstate: vatpic is not locked")); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? "asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. + */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->request &= ~(1 << pin); + + if (atpic->aeoi == true) { + if (atpic->rotate == true) + atpic->lowprio = pin; + } else { + atpic->service |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. + */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { + atpic->poll = 0; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrrupt mask register */ + *eax = atpic->mask; + } else { + if (atpic->rd_cmd_reg == OCW3_RIS) { + /* read interrupt service register */ + *eax = atpic->service; + } else { + /* read interrupt request register */ + *eax = atpic->request; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_num) { + case 2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case 3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case 4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 1) + return (-1); + + VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + free(vatpic, M_VATPIC); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h new file mode 100644 index 0000000000..ef5e51b158 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include <isa/isareg.h> + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#endif /* _VATPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c new file mode 100644 index 0000000000..ce17bdc92c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -0,0 +1,458 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) +#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) +#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + sbintime_t now_sbt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + sbintime_t callout_sbt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + struct mtx mtx; + + sbintime_t freq_sbt; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + sbintime_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + out = ((c->initial - delta_ticks) <= 0); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + sbintime_t now, delta, precision; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + delta = c->initial * vatpit->freq_sbt; + precision = delta >> tc_precexp; + c->callout_sbt = c->callout_sbt + delta; + + /* + * Reset 'callout_sbt' if the time that the callout + * was supposed to fire is more than 'c->initial' + * ticks in the past. + */ + now = sbinuptime(); + if (c->callout_sbt < now) + c->callout_sbt = now + delta; + + callout_reset_sbt(&c->callout, c->callout_sbt, + precision, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + sbintime_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. + * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + c->now_sbt = sbinuptime(); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. + */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + c->now_sbt = sbinuptime(); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_sbt = c->now_sbt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct bintime bt; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); + + FREQ2BT(PIT_8254_FREQ, &bt); + vatpit->freq_sbt = bttosbt(bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, true); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + free(vatpit, M_VATPIT); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h new file mode 100644 index 0000000000..f20ad73e47 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include <machine/timerreg.h> + +#define NMISC_PORT 0x61 + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); + +#endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.c b/usr/src/uts/i86pc/io/vmm/io/vdev.c new file mode 100644 index 0000000000..0f835625f3 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.c @@ -0,0 +1,282 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.h b/usr/src/uts/i86pc/io/vmm/io/vdev.h new file mode 100644 index 0000000000..dd2df75ad8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c new file mode 100644 index 0000000000..25f6013da0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -0,0 +1,821 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/cpuset.h> + +#include <dev/acpica/acpi_hpet.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +#include "vmm_ktr.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. + */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet { + struct vm *vm; + struct mtx mtx; + sbintime_t freq_sbt; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t countbase; /* HPET counter base value */ + sbintime_t countbase_sbt; /* uptime corresponding to base value */ + + struct { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + sbintime_t callout_sbt; /* time when counter==compval */ + struct vhpet_callout_arg arg; + } timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) +#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + sbintime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool +vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + /* + * LegacyReplacement Route configuration takes precedence over MSI + * for timers 0 and 1. + */ + if (n == 0 || n == 1) { + if (vhpet->config & HPET_CNF_LEG_RT) + return (false); + } + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. + */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + if (vhpet->config & HPET_CNF_LEG_RT) { + /* + * In "legacy routing" timers 0 and 1 are connected to + * ioapic pins 2 and 8 respectively. + */ + switch (n) { + case 0: + return (2); + case 1: + return (8); + } + } + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static __inline int +vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) +{ + if (vhpet->config & HPET_CNF_LEG_RT) { + /* + * In "legacy routing" timers 0 and 1 are connected to + * 8259 master pin 0 and slave pin 0 respectively. + */ + switch (n) { + case 0: + return (0); + case 1: + return (8); + } + } + + return (-1); +} + +static uint32_t +vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) +{ + uint32_t val; + sbintime_t now, delta; + + val = vhpet->countbase; + if (vhpet_counter_enabled(vhpet)) { + now = sbinuptime(); + delta = now - vhpet->countbase_sbt; +#ifdef __FreeBSD__ + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%#lx to %#lx", vhpet->countbase_sbt, now)); +#else + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%lx to %lx", vhpet->countbase_sbt, now)); +#endif + val += delta / vhpet->freq_sbt; + if (nowptr != NULL) + *nowptr = now; + } else { + /* + * The sbinuptime corresponding to the 'countbase' is + * meaningless when the counter is disabled. Make sure + * that the the caller doesn't want to use it. + */ + KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); + } + return (val); +} + +static void +vhpet_timer_clear_isr(struct vhpet *vhpet, int n) +{ + int pin, legacy_pin; + + if (vhpet->isr & (1 << n)) { + pin = vhpet_timer_ioapic_pin(vhpet, n); + KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); + vioapic_deassert_irq(vhpet->vm, pin); + + legacy_pin = vhpet_timer_atpic_pin(vhpet, n); + if (legacy_pin != -1) + vatpic_deassert_irq(vhpet->vm, legacy_pin); + + vhpet->isr &= ~(1 << n); + } +} + +static __inline bool +vhpet_periodic_timer(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); +} + +static __inline bool +vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); +} + +static __inline bool +vhpet_timer_edge_trig(struct vhpet *vhpet, int n) +{ + + KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " + "timer %d is using MSI", n)); + + /* The legacy replacement interrupts are always edge triggered */ + if (vhpet->config & HPET_CNF_LEG_RT) { + if (n == 0 || n == 1) + return (true); + } + + if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) + return (true); + else + return (false); +} + +static void +vhpet_timer_interrupt(struct vhpet *vhpet, int n) +{ + int pin, legacy_pin; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + /* + * If a level triggered interrupt is already asserted then just return. + */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + legacy_pin = vhpet_timer_atpic_pin(vhpet, n); + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + if (legacy_pin != -1) + vatpic_pulse_irq(vhpet->vm, legacy_pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + if (legacy_pin != -1) + vatpic_assert_irq(vhpet->vm, legacy_pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that is "ahead" + * of 'counter'. + */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta, precision; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + precision = delta >> tc_precexp; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. + */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. + */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, + void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. + */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 24) + allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. + */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h new file mode 100644 index 0000000000..868809d166 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $ + */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(struct vm_hpet_cap *cap); + +#endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c new file mode 100644 index 0000000000..5adf5de16d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -0,0 +1,514 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <x86/apicreg.h> +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 24 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct vioapic { + struct vm *vm; + struct mtx mtx; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; + +#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) +#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) +#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +#ifdef KTR +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? true : false; + if (level) + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); + } + + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vm, vcpuid); + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. + */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. + */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + cpuset_t allvcpus; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to update their vlapic trigger-mode registers. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); +#if 0 /* XXX */ + allvcpus = vm_active_cpus(vioapic->vm); + vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, + vioapic_update_tmr, NULL); +#endif + VIOAPIC_LOCK(vioapic); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin is not masked + * - previous interrupt has been EOIed + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. + */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. + */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h new file mode 100644 index 0000000000..9479ebb10e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VIOAPIC_H_ +#define _VIOAPIC_H_ + +#define VIOAPIC_BASE 0xFEC00000 +#define VIOAPIC_SIZE 4096 + +#include "vdev.h" + +struct vm; + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, + uint64_t wval, int size, void *arg); +int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, + uint64_t *rval, int size, void *arg); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c new file mode 100644 index 0000000000..9a0a3058ea --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -0,0 +1,1687 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/clock.h> +#include <machine/smp.h> + +#include <machine/vmm.h> + +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vlapic.h" +#include "vlapic_priv.h" +#include "vioapic.h" + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) + +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) + +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + +#define VLAPIC_BUS_FREQ tsc_freq + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + else + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. + */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } +#ifdef __FreeBSD__ + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); +#else + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, " + "icr_timer is %x", ccr, lapic->icr_timer)); +#endif + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +int +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return (0); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); + } + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], mask); + + /* + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. + */ + tmrptr = &lapic->tmr0; + if ((tmrptr[idx] & mask) != (level ? mask : 0)) { + VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " + "interrupt is %s-triggered", idx / 4, tmrptr[idx], + level ? "level" : "edge"); + } + + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; + default: + panic("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } +#ifdef __FreeBSD__ + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); +#else + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %x", index, offset)); +#endif + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +{ + uint32_t vec, mode; + + if (lvt & APIC_LVT_M) + return (0); + + vec = lvt & APIC_LVT_VECTOR; + mode = lvt & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + return (0); + } + if (vlapic_set_intr_ready(vlapic, vec, false)) + vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_DM_EXTINT: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + default: + // Other modes ignored + return (0); + } + return (1); +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic_page->isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic_page->tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic_page->isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic_page->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr; + int i, idx, bitpos, vector; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos-- != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << bitpos); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vector = i * 32 + bitpos; + vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, + vector); + } + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return (lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +{ + uint32_t lvt; + + vlapic->esr_pending |= mask; + if (vlapic->esr_firing) + return; + vlapic->esr_firing = 1; + + // The error LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } + vlapic->esr_firing = 0; +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); + + // The timer LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + uint32_t lvt; + + if (vlapic_enabled(vlapic) == false) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. + */ + switch (vector) { + case APIC_LVT_LINT0: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_LINT1: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + default: + break; + } + return (0); + } + + switch (vector) { + case APIC_LVT_LINT0: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); + break; + case APIC_LVT_LINT1: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); + break; + case APIC_LVT_TIMER: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_ERROR: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_PMC: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); + break; + case APIC_LVT_THERMAL: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); + break; + case APIC_LVT_CMCI: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + break; + default: + return (EINVAL); + } + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } + return (0); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); +#ifdef __FreeBSD__ + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); +#else + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %lx.%lx, expected at %lx.%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); +#endif + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. + */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " + "usecs, period is %lu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, rem_sbt, 0, + vlapic_callout_handler, vlapic, 0); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + sbintime_t sbt; + uint32_t icr_timer; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = vlapic->apic_page; + icr_timer = lapic->icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. + * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +static void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, dest); + if (vcpuid < VM_MAXCPU) + CPU_SET(vcpuid, dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only avilable in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in it's LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + while ((vcpuid = CPU_FFS(&amask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &amask); + + vlapic = vm_lapic(vm, vcpuid); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (x2apic(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it. + */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " + "from %#x to %#x", lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +static uint8_t +vlapic_get_tpr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + return (lapic->tpr); +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~0xf) { + vm_inject_gp(vlapic->vm, vlapic->vcpuid); + return; + } + + tpr = val << 4; + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + uint8_t tpr; + + tpr = vlapic_get_tpr(vlapic); + return (tpr >> 4); +} + +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) +{ + int i; + bool phys; + cpuset_t dmask; + uint64_t icrval; + uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + phys = ((icrval & APIC_DESTMODE_LOG) == 0); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, + x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + CPU_ZERO(&dmask); /* satisfy gcc */ + break; + } + + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + if (mode == APIC_DELMODE_FIXED) { + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + IPIS_SENT, i, 1); + VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " + "to vcpuid %d", vec, i); + } else { + vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " + "to vcpuid %d", i); + } + } + + return (0); /* handled completely in the kernel */ + } + + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + return (0); + + vlapic2->boot_state = BS_RUNNING; + + *retu = true; + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = dest; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + + return (0); + } + } + + /* + * This will cause a return to userland. + */ + return (1); +} + +void +vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) +{ + int vec; + + KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); + + vec = val & 0xff; + lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, + vlapic->vcpuid, 1); + VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); +} + +int +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct LAPIC *lapic = vlapic->apic_page; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + if (vlapic->ops.pending_intr) + return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); + } else + break; + } + } + return (0); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + if (vlapic->ops.intr_accepted) + return ((*vlapic->ops.intr_accepted)(vlapic, vector)); + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. + */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +void +vlapic_svr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; + + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +int +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + + if (offset > sizeof(*lapic)) { + *data = 0; + goto done; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = vlapic_get_tpr(vlapic); + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + *data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_SELF_IPI: + /* + * XXX generate a GP fault if vlapic is in x2apic mode + */ + *data = 0; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); + return 0; +} + +int +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + int retval; + +#ifdef __FreeBSD__ + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %#lx", offset)); +#else + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %lx", offset)); +#endif + + VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", + offset, data); + + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " + "in xAPIC mode", data, offset); + return (0); + } + + retval = 0; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (x2apic(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +static void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_reset_tmr(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. + */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. + */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(vlapic->vcpuid, &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h new file mode 100644 index 0000000000..3fa705d818 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +struct vm; +enum x2apic_state; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. + */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. + */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); +#endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h new file mode 100644 index 0000000000..f9bd2e0e8b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $ + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + int esr_firing; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + enum boot_state boot_state; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in new file mode 100644 index 0000000000..4b1fe1d6b6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/offsets.in @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> + +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" + +vmxctx + tmpstktop VMXCTX_TMPSTKTOP + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + host_r15 VMXCTX_HOST_R15 + host_r14 VMXCTX_HOST_R14 + host_r13 VMXCTX_HOST_R13 + host_r12 VMXCTX_HOST_R12 + host_rbp VMXCTX_HOST_RBP + host_rsp VMXCTX_HOST_RSP + host_rbx VMXCTX_HOST_RBX + host_rip VMXCTX_HOST_RIP + launch_error VMXCTX_LAUNCH_ERROR + +vmx VMX_SIZE + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_RETURN_DIRECT 0 +\#define VMX_RETURN_LONGJMP 1 +\#define VMX_RETURN_VMRESUME 2 +\#define VMX_RETURN_VMLAUNCH 3 +\#define VMX_RETURN_AST 4 + +cpu + cpu_thread + +_kthread + t_lwp + _tu._ts._t_astflag T_ASTFLAG diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c new file mode 100644 index 0000000000..7081368f4a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -0,0 +1,1894 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <x86/psl.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +#include <machine/vm.h> +#include <machine/pcb.h> +#include <machine/smp.h> +#include <x86/apicreg.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_host.h" +#include "vmm_mem.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#ifdef __FreeBSD__ +#include "io/ppt.h" +#include "io/iommu.h" +#endif + +struct vhpet; +struct vioapic; +struct vlapic; + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + struct vlapic *vlapic; + int vcpuid; + struct savefpu *guestfpu; /* guest fpu state */ + void *stats; + struct vm_exit exitinfo; + uint64_t nextrip; /* (x) next instruction to execute */ + enum x2apic_state x2apic_state; + uint64_t exitintinfo; + int nmi_pending; + int extint_pending; + struct vm_exception exception; + int exception_pending; +}; + +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +#define VM_MAX_MEMORY_SEGMENTS 8 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + struct vhpet *vhpet; + struct vioapic *vioapic; /* virtual ioapic */ + struct vatpic *vatpic; /* virtual atpic */ + struct vatpit *vatpit; /* virtual atpit */ + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Set of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpuset_t active_cpus; + + vm_rendezvous_func_t rendezvous_func; +}; + +static int vmm_initialized; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? \ + (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ + ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) +#define VLAPIC_INIT(vmi, vcpu) \ + (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static void +vcpu_cleanup(struct vm *vm, int i) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); +#ifdef __FreeBSD__ + vmm_stat_free(vcpu->stats); +#endif + fpu_save_area_free(vcpu->guestfpu); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->exitintinfo = 0; + vcpu->guestfpu = fpu_save_area_alloc(); + fpu_save_area_reset(vcpu->guestfpu); +#ifdef __FreeBSD__ + vcpu->stats = vmm_stat_alloc(); +#endif +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + int error; + +#ifndef __FreeBSD__ + vmm_sol_glue_init(); +#endif + + vmm_host_state_init(); +#ifdef __FreeBSD__ + vmm_ipi_init(); +#endif + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + return (VMM_INIT()); +} + +#ifdef __FreeBSD__ +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + if (ppt_num_devices() > 0) + iommu_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0) { +#ifndef __FreeBSD__ + vmm_sol_glue_cleanup(); +#endif + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + /* + * Something bad happened - prevent new + * VMs from being created + */ + if (error) + vmm_initialized = 0; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - iommu initialization must happen after the pci passthru driver has had + * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); +#else +int +vmm_mod_load() +{ + int error; + + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + + return (error); +} + +int +vmm_mod_unload() +{ + int error; + + error = vmmdev_cleanup(); + if (error) + return (error); + error = VMM_CLEANUP(); + if (error) + return (error); + vmm_initialized = 0; + + return (0); +} +#endif + +int +vm_create(const char *name, struct vm **retvm) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + } + +#ifdef __FreeBSD__ + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); +#endif + + *retvm = vm; + return (0); +} + +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + vm_paddr_t hpa; + void *host_domain; + +#ifdef __FreeBSD__ + host_domain = iommu_host_domain(); +#endif + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (vm_paddr_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + +#ifdef __FreeBSD__ + /* + * Remove the 'gpa' to 'hpa' mapping in VMs domain. + * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. + */ + iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); + iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); +#endif + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; + } + +#ifdef __FreeBSD__ + /* + * Invalidate cached translations associated with 'vm->iommu' since + * we have now moved some pages from it. + */ + iommu_invalidate_tlb(vm->iommu); +#endif + + bzero(seg, sizeof(struct vm_memory_segment)); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + +#ifdef __FreeBSD__ + ppt_unassign_all(vm); +#endif + + for (i = 0; i < vm->num_mem_segs; i++) + vm_free_mem_seg(vm, &vm->mem_segs[i]); + + vm->num_mem_segs = 0; + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(vm, i); + + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + +#ifdef __FreeBSD__ + iommu_destroy_domain(vm->iommu); +#endif + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +#ifdef __FreeBSD__ +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, + VM_PROT_NONE, spok)); +} +#endif + +/* + * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + */ +static boolean_t +vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +{ + int i; + vm_paddr_t gpabase, gpalimit; + + if (gpa & PAGE_MASK) + panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (FALSE); + } + + return (TRUE); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int error, available, allocated; + struct vm_memory_segment *seg; + vm_paddr_t g, hpa; + void *host_domain; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + return (EINVAL); + + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_gpa_available(vm, g)) + available++; + else + allocated++; + + g += PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address + * range then it is an error. + */ + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. + */ + if (allocated && available == 0) + return (0); + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + +#ifdef __FreeBSD__ + host_domain = iommu_host_domain(); +#endif + + seg = &vm->mem_segs[vm->num_mem_segs]; + + error = 0; + seg->gpa = gpa; + seg->len = 0; + while (seg->len < len) { + hpa = vmm_mem_alloc(PAGE_SIZE); + if (hpa == 0) { + error = ENOMEM; + break; + } + + error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, + VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); + if (error) + break; + +#ifdef __FreeBSD__ + /* + * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. + * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. + */ + iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); + iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); +#endif + + seg->len += PAGE_SIZE; + } + + if (error) { + vm_free_mem_seg(vm, seg); + return (error); + } + +#ifdef __FreeBSD__ + /* + * Invalidate cached translations associated with 'host_domain' since + * we have now moved some pages from it. + */ + iommu_invalidate_tlb(host_domain); +#endif + + vm->num_mem_segs++; + + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + vm_paddr_t nextpage; + + nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + return (VMMMAP_GET(vm->cookie, gpa)); +} + +void * +vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ +#ifdef __FreeBSD__ + int count, pageoff; + vm_page_t m; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +#else + int pageoff; + vm_paddr_t hpa; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa == (vm_paddr_t)-1) + return (NULL); + + return (hat_kpm_pfn2va(btop(hpa)) + pageoff); +#endif +} + +void +vm_gpa_release(void *cookie) +{ +#ifdef __FreeBSD__ + vm_page_t m = cookie; + + vm_page_lock(m); + vm_page_unhold(m); + vm_page_unlock(m); +#endif +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + fpuexit(curthread); + + /* restore guest FPU state */ + fpu_stop_emulating(); + fpurestore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + fpu_start_emulating(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest FPU state */ + fpu_stop_emulating(); + fpusave(vcpu->guestfpu); + fpu_start_emulating(); +} + +static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) +{ + struct vm_exit *vmexit; + struct vcpu *vcpu; + int t, timo, spindown; + + vcpu = &vm->vcpu[vcpuid]; + spindown = 0; + + vcpu_lock(vcpu); + + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. + * + * These interrupts could have happened any time after we + * returned from VMRUN() and before we grabbed the vcpu lock. + */ + if (vm->rendezvous_func == NULL && + !vm_nmi_pending(vm, vcpuid) && + (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { + t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + if (vlapic_enabled(vcpu->vlapic)) { + /* + * XXX msleep_spin() is not interruptible so use the + * 'timo' to put an upper bound on the sleep time. + */ + timo = hz; + msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + } else { + /* + * Spindown the vcpu if the apic is disabled and it + * had entered the halted state. + */ + spindown = 1; + } + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + vcpu_unlock(vcpu); + +#ifdef __FreeBSD__ + /* + * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it + * outside the confines of the vcpu spinlock. + */ + if (spindown) { + *retu = true; + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; + vm_deactivate_cpu(vm, vcpuid); + VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + } +#endif + + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, length; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + /* + * If the instruction length is not known then assume a + * maximum size instruction. + */ + length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = 0; + } + if (error == 1) + return (0); /* Resume guest to handle page fault */ + else if (error == -1) + return (EFAULT); + else if (error != 0) + panic("%s: vmm_fetch_instruction error %d", __func__, error); + + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) + return (EFAULT); + + /* + * If the instruction length was not specified then update it now + * along with 'nextrip'. + */ + if (vme->inst_length == 0) { + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + } + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); + + return (error); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; +restart: + critical_enter(); + + tscval = rdtsc(); + +#ifdef __FreeBSD__ + pcb = PCPU_GET(curpcb); + set_pcb_flags(pcb, PCB_FULL_IRET); +#endif + +#ifndef __FreeBSD__ + installctx(curthread, vcpu, save_guest_fpustate, + restore_guest_fpustate, NULL, NULL, NULL, NULL); +#endif + restore_guest_fpustate(vcpu); + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + + save_guest_fpustate(vcpu); +#ifndef __FreeBSD__ + removectx(curthread, vcpu, save_guest_fpustate, + restore_guest_fpustate, NULL, NULL, NULL, NULL); +#endif + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, vme, &retu); + break; + default: + retu = true; /* handled in userland */ + break; + } + } + + if (error == 0 && retu == false) { + goto restart; + } + + /* copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + +#ifdef __FreeBSD__ + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); +#else + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); +#endif + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + +#ifdef __FreeBSD__ + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); +#else + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); +#endif + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); +#ifdef __FreeBSD__ + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); +#endif + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. */ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exception.vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exception.error_code_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exception.error_code << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", + vcpu->exception.vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (exception->vector < 0 || exception->vector >= 32) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", exception->vector, + vcpu->exception.vector); + return (EBUSY); + } + + vcpu->exception_pending = 1; + vcpu->exception = *exception; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); + return (0); +} + +int +vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) +{ + struct vcpu *vcpu; + int pending; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + pending = vcpu->exception_pending; + if (pending) { + vcpu->exception_pending = 0; + *exception = vcpu->exception; + VCPU_CTR1(vm, vcpuid, "Exception %d delivered", + exception->vector); + } + return (pending); +} + +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + struct vm_exception exception; + struct vm_exit *vmexit; + struct vm *vm; + int error; + + vm = vmarg; + + exception.vector = vector; + exception.error_code = errcode; + exception.error_code_valid = errcode_valid; + error = vm_inject_exception(vm, vcpuid, &exception); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); + + /* + * A fault-like exception allows the instruction to be restarted + * after the exception handler returns. + * + * By setting the inst_length to 0 we ensure that the instruction + * pointer remains at the faulting instruction. + */ + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->inst_length = 0; +} + +void +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm *vm; + int error; + + vm = vmarg; + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int +vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} + +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + panic("vm_extint_clear: inconsistent extint_pending state"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + return (vm->vhpet); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + return (vm->vioapic); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +#ifdef __FreeBSD__ +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, i, n; + int b, s, f; + char *val, *cp, *cp2; + + /* + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! + */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ + found = 0; + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + } + return (found); +} +#endif + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + int hostcpu; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. + */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } + vcpu_unlock(vcpu); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int idx; + +#ifdef __FreeBSD__ + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } +#endif + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); + if (error) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, + prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (-1); + } else { + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm.conf b/usr/src/uts/i86pc/io/vmm/vmm.conf new file mode 100644 index 0000000000..8833076014 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.conf @@ -0,0 +1 @@ +name="vmm" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c new file mode 100644 index 0000000000..b94caf4009 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $"); + +#include <sys/param.h> +#include <sys/pcpu.h> + +#include <machine/cpufunc.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; + +void +vmm_host_state_init(void) +{ + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + vmm_host_cr4 = rcr4(); +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_datasel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GDATA_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KDATA, SEL_KPL)); +#endif + +} + +uint64_t +vmm_get_host_codesel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GCODE_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KCODE, SEL_KPL)); +#endif +} + + +uint64_t +vmm_get_host_tsssel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GPROC0_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KTSS, SEL_KPL)); +#endif +} + +uint64_t +vmm_get_host_fsbase(void) +{ + +#ifdef __FreeBSD__ + return (0); +#else + return (rdmsr(MSR_FSBASE)); +#endif +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + +#ifdef __FreeBSD__ + return (r_idt.rd_base); +#else + desctbr_t idtr; + + rd_idtr(&idtr); + return (idtr.dtr_base); +#endif +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h new file mode 100644 index 0000000000..5de015a228 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef __FreeBSD__ +#include <sys/cpuvar.h> +#endif + +#ifndef _KERNEL +#error "no user-servicable parts inside" +#endif + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)PCPU_GET(tssp)); +#else + return ((u_long)CPU->cpu_tss); +#endif +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&gdt[NGDT * curcpu]); +#else + desctbr_t gdtr; + + rd_gdtr(&gdtr); + return (gdtr.dtr_base); +#endif +} + +struct pcpu; +extern struct pcpu __pcpu[]; + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&__pcpu[curcpu]); +#else + return (rdmsr(MSR_GSBASE)); +#endif +} + +#ifndef __FreeBSD__ +static __inline uint64_t +vmm_get_host_fssel(void) +{ + return (KFS_SEL); +} + +static __inline uint64_t +vmm_get_host_gssel(void) +{ + return (KGS_SEL); +} +#endif +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000000..72c7056e26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -0,0 +1,2370 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $"); + +#ifdef _KERNEL +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/vmparam.h> +#include <machine/vmm.h> +#else /* !_KERNEL */ +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/_iovec.h> + +#include <machine/vmm.h> + +#include <assert.h> +#include <vmmapi.h> +#define KASSERT(exp,msg) assert((exp)) +#endif /* _KERNEL */ + +#include <machine/vmm_instruction_emul.h> +#include <x86/psl.h> +#include <x86/specialreg.h> + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_MOVSX, + VIE_OP_TYPE_MOVZX, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_OR, + VIE_OP_TYPE_SUB, + VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, + VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) + +static const struct vie_op two_byte_opcodes[256] = { + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x81] = { + /* XXX Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* XXX Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. + */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). + */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, val); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. + * + * Returns 0 on success and 1 if an exception was injected into the guest. + */ +static int +get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, + int opsize, int addrsize, int prot, enum vm_reg_name seg, + enum vm_reg_name gpr, uint64_t *gla) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (1); + } + + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (2): read from system memory and write to mmio. + */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + goto done; + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * Case (4): read from and write to mmio. + */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa); + if (error) + goto done; + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa); + if (error) + goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + if (error < 0) + return (EFAULT); + else + return (0); +} + +static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t val1, result, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x3B: + /* + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare first operand (reg) with second operand (r/m) and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &op1); + if (error) + return (error); + + /* Get the second operand */ + error = memread(vm, vcpuid, gpa, &op2, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, op2); + break; + case 0x81: + case 0x83: + /* + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. + * + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, size, stackaddrsize, pushop; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compability mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); + if (error == -1) { + /* + * XXX cannot return a negative error value here because it + * ends up being the return value of the VM_RUN() ioctl and + * is interpreted as a pseudo-error (for e.g. ERESTART). + */ + return (EFAULT); + } else if (error == 1) { + /* Resume guest execution to handle page fault */ + return (0); + } + + if (pushop) { + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, size); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, size); + error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + rsp += size; + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. + */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. + */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); +#ifdef __FreeBSD__ + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); +#else + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %x", __func__, prot)); +#endif + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ +#ifdef __FreeBSD__ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); +#else + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %x", seg, desc->access)); +#endif + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); +#ifdef __FreeBSD__ + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); +#else + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %x", seg, type)); +#endif + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +#ifdef _KERNEL +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof(struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + + if (inst_length) { + bcopy(inst_bytes, vie->inst, inst_length); + vie->num_valid = inst_length; + } +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; +#ifdef __FreeBSD__ +#endif + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; + uint32_t *ptpbase32, pte32; + void *cookie; + + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; +#ifdef __FreeBSD__ + retries = 0; +#endif +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); +#ifdef __FreeBSD__ + if (retries++ > 0) + maybe_yield(); +#endif + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. + */ + if ((pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if ((pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + pfcode = pf_error_code(usermode, prot, 1, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + ptp_release(&cookie); + return (retval); +error: + retval = -1; + goto done; +fault: + retval = 1; + goto done; +} + +int +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid length %d", inst_length); + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo)); + if (error == 0) { + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + } + return (error); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. + */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. + */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +/* + * Verify that all the bytes in the instruction buffer were consumed. + */ +static int +verify_inst_length(struct vie *vie) +{ + + if (vie->num_processed) + return (0); + else + return (-1); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. + */ +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx, gla2; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_valid; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* XXX assuming that the base address of the segment is 0 */ + gla2 = base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla, gla2); + return (-1); + } + + return (0); +} + +int +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + if (verify_inst_length(vie)) + return (-1); + + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); + } + + vie->decoded = 1; /* success */ + + return (0); +} +#endif /* _KERNEL */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c new file mode 100644 index 0000000000..bea750f162 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include "vatpic.h" +#include "vatpit.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +#define MAX_IOPORTS 1280 + +ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { + [TIMER_MODE] = vatpit_handler, + [TIMER_CNTR0] = vatpit_handler, + [TIMER_CNTR1] = vatpit_handler, + [TIMER_CNTR2] = vatpit_handler, + [NMISC_PORT] = vatpit_nmisc_handler, + [IO_ICU1] = vatpic_master_handler, + [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, + [IO_ICU2] = vatpic_slave_handler, + [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, + [IO_ELCR1] = vatpic_elc_handler, + [IO_ELCR2] = vatpic_elc_handler, +}; + +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) +{ + int index; + + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd", + "insb", "insw", "insd", + }; + + switch (vmexit->u.inout.bytes) { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + default: + index = 2; + break; + } + + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, + bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val; + int error; + + /* + * If there is no handler for the I/O port then punt to userspace. + */ + if (vmexit->u.inout.port >= MAX_IOPORTS || + (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { + *retu = true; + return (0); + } + + mask = vie_size2mask(vmexit->u.inout.bytes); + + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + + error = (*handler)(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error) { + /* + * The value returned by this function is also the return value + * of vm_run(). This needs to be a positive number otherwise it + * can be interpreted as a "pseudo-error" like ERESTART. + * + * Enforce this by mapping all errors to EIO. + */ + return (EIO); + } + + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vm, vcpuid, vmexit, retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h new file mode 100644 index 0000000000..624dd8f1d8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); + +#endif /* _VMM_IOPORT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h new file mode 100644 index 0000000000..4dff03ba1f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +#ifdef __FreeBSD__ +int vmm_ipi_alloc(void); +void vmm_ipi_free(int num); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h new file mode 100644 index 0000000000..917c7f83a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c new file mode 100644 index 0000000000..3215c74a44 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? "physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_write(vlapic, 1, off, wval, arg); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses should be aligned on a + * 16-byte boundary. They are also suggested to be 4 bytes + * wide, alas not all OSes follow suggestions. + */ + off &= ~3; + if (off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_read(vlapic, 1, off, rval, arg); + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h new file mode 100644 index 0000000000..ee47ee7783 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); + +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h new file mode 100644 index 0000000000..05dc37fb9a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c new file mode 100644 index 0000000000..79e1cb1a44 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -0,0 +1,1040 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/cpuvar.h> +#include <sys/ioccom.h> +#include <sys/stat.h> +#include <sys/vmsystm.h> +#include <sys/ddi.h> +/* + * struct modctl in <sys/modctl.h> contains "void *__unused". + * Do this ugly workaround to avoid it. + */ +#undef __unused +#include <sys/sunddi.h> +#include <sys/fs/dv_node.h> + +#include <sys/vmm.h> +#include <sys/vmm_instruction_emul.h> +#include <sys/vmm_dev.h> +#include <sys/vmm_impl.h> + +#include <vm/vm.h> +#include <vm/seg_dev.h> + +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "vmm_lapic.h" + +static dev_info_t *vmm_dip; +static void *vmm_statep; + +static SLIST_HEAD(, vmm_softc) head; + +static kmutex_t vmmdev_mtx; + +/* + * vmm trace ring + */ +int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE; +static vmm_trace_rbuf_t *vmm_debug_rbuf; +static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void); +static void vmm_trace_dmsg_free(void); +static void vmm_trace_rbuf_alloc(void); +static void vmm_trace_rbuf_free(void); + +/* + * This routine is used to manage debug messages + * on ring buffer. + */ +static vmm_trace_dmsg_t * +vmm_trace_dmsg_alloc(void) +{ + vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp; + + if (vmm_debug_rbuf->looped == TRUE) { + vmm_debug_rbuf->dmsgp = dmsg->next; + return (vmm_debug_rbuf->dmsgp); + } + + /* + * If we're looping for the first time, + * connect the ring. + */ + if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) > + vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) { + dmsg->next = vmm_debug_rbuf->dmsgh; + vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh; + vmm_debug_rbuf->looped = TRUE; + return (vmm_debug_rbuf->dmsgp); + } + + /* If we've gotten this far then memory allocation is needed */ + dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP); + if (dmsg_alloc == NULL) { + vmm_debug_rbuf->allocfailed++; + return (dmsg_alloc); + } else { + vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t); + } + + if (vmm_debug_rbuf->dmsgp != NULL) { + dmsg->next = dmsg_alloc; + vmm_debug_rbuf->dmsgp = dmsg->next; + return (vmm_debug_rbuf->dmsgp); + } else { + /* + * We should only be here if we're initializing + * the ring buffer. + */ + if (vmm_debug_rbuf->dmsgh == NULL) { + vmm_debug_rbuf->dmsgh = dmsg_alloc; + } else { + /* Something is wrong */ + kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t)); + return (NULL); + } + + vmm_debug_rbuf->dmsgp = dmsg_alloc; + return (vmm_debug_rbuf->dmsgp); + } +} + +/* + * Free all messages on debug ring buffer. + */ +static void +vmm_trace_dmsg_free(void) +{ + vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh; + + while (dmsg != NULL) { + dmsg_next = dmsg->next; + kmem_free(dmsg, sizeof (vmm_trace_dmsg_t)); + + /* + * If we've looped around the ring than we're done. + */ + if (dmsg_next == vmm_debug_rbuf->dmsgh) { + break; + } else { + dmsg = dmsg_next; + } + } +} + +static void +vmm_trace_rbuf_alloc(void) +{ + vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP); + + mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL); + + if (vmm_dmsg_ring_size > 0) { + vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size; + } +} + + +static void +vmm_trace_rbuf_free(void) +{ + vmm_trace_dmsg_free(); + mutex_destroy(&vmm_debug_rbuf->lock); + kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t)); +} + +static void +vmm_vtrace_log(const char *fmt, va_list ap) +{ + vmm_trace_dmsg_t *dmsg; + + if (vmm_debug_rbuf == NULL) { + return; + } + + /* + * If max size of ring buffer is smaller than size + * required for one debug message then just return + * since we have no room for the debug message. + */ + if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) { + return; + } + + mutex_enter(&vmm_debug_rbuf->lock); + + /* alloc or reuse on ring buffer */ + dmsg = vmm_trace_dmsg_alloc(); + + if (dmsg == NULL) { + /* resource allocation failed */ + mutex_exit(&vmm_debug_rbuf->lock); + return; + } + + gethrestime(&dmsg->timestamp); + + (void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap); + + mutex_exit(&vmm_debug_rbuf->lock); +} + +void +vmm_trace_log(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmm_vtrace_log(fmt, ap); + va_end(ap); +} + +void +vmmdev_init(void) +{ + vmm_trace_rbuf_alloc(); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + if (error == 0) + vmm_trace_dmsg_free(); + + return (error); +} + +int +vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp) +{ + int error, vcpu, state_changed; + struct vm_memory_segment seg; + struct vm_register vmreg; + struct vm_seg_desc vmsegdesc; + struct vm_run vmrun; + struct vm_exception vmexc; + struct vm_lapic_irq vmirq; + struct vm_lapic_msi vmmsi; + struct vm_ioapic_irq ioapic_irq; + struct vm_isa_irq isa_irq; + struct vm_capability vmcap; + struct vm_nmi vmnmi; + struct vm_x2apic x2apic; + struct vm_gla2gpa gg; + struct vm_activate_cpu vac; + int pincount; + int i; + + vcpu = -1; + state_changed = 0; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: + case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: + case VM_ACTIVATE_CPU: + case VM_RESTART_INSTRUCTION: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) { + return (EFAULT); + } + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + goto done; + + state_changed = 1; + break; + case VM_MAP_MEMORY: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = 0; + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + goto done; + } + + state_changed = 2; + break; + + default: + break; + } + + switch(cmd) { + case VM_RUN: + if (ddi_copyin((void *)arg, &vmrun, + sizeof (struct vm_run), mode)) { + return (EFAULT); + } + error = vm_run(sc->vm, &vmrun); + if (ddi_copyout(&vmrun, (void *)arg, + sizeof (struct vm_run), mode)) { + return (EFAULT); + } + break; + case VM_LAPIC_IRQ: + if (ddi_copyin((void *)arg, &vmirq, + sizeof (struct vm_lapic_irq), mode)) { + return (EFAULT); + } + error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector); + if (ddi_copyout(&vmirq, (void *)arg, + sizeof (struct vm_lapic_irq), mode)) { + return (EFAULT); + } + break; + case VM_LAPIC_LOCAL_IRQ: + if (ddi_copyin((void *)arg, &vmirq, + sizeof (struct vm_lapic_irq), mode)) { + return (EFAULT); + } + error = lapic_set_local_intr(sc->vm, vmirq.cpuid, + vmirq.vector); + if (ddi_copyout(&vmirq, (void *)arg, + sizeof (struct vm_lapic_irq), mode)) { + return (EFAULT); + } + break; + case VM_LAPIC_MSI: + if (ddi_copyin((void *)arg, &vmmsi, + sizeof (struct vm_lapic_msi), mode)) { + return (EFAULT); + } + error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg); + if (ddi_copyout(&vmmsi, (void *)arg, + sizeof (struct vm_lapic_msi), mode)) { + return (EFAULT); + } + case VM_IOAPIC_ASSERT_IRQ: + if (ddi_copyin((void *)arg, &ioapic_irq, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);; + if (ddi_copyout(&ioapic_irq, (void *)arg, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + break; + case VM_IOAPIC_DEASSERT_IRQ: + if (ddi_copyin((void *)arg, &ioapic_irq, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq); + if (ddi_copyout(&ioapic_irq, (void *)arg, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + break; + case VM_IOAPIC_PULSE_IRQ: + if (ddi_copyin((void *)arg, &ioapic_irq, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq); + if (ddi_copyout(&ioapic_irq, (void *)arg, + sizeof (struct vm_ioapic_irq), mode)) { + return (EFAULT); + } + break; + case VM_IOAPIC_PINCOUNT: + error = 0; + pincount = vioapic_pincount(sc->vm); + if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) { + return (EFAULT); + } + break; + case VM_ISA_ASSERT_IRQ: + if (ddi_copyin((void *)arg, &isa_irq, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + } + error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) + error = vioapic_assert_irq(sc->vm, + isa_irq.ioapic_irq); + if (ddi_copyout(&isa_irq, (void *)arg, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + + } + break; + case VM_ISA_DEASSERT_IRQ: + if (ddi_copyin((void *)arg, &isa_irq, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + } + error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) + error = vioapic_deassert_irq(sc->vm, + isa_irq.ioapic_irq); + if (ddi_copyout(&isa_irq, (void *)arg, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + + } + break; + case VM_ISA_PULSE_IRQ: + if (ddi_copyin((void *)arg, &isa_irq, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + } + error = vatpic_pulse_irq(sc->vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) + error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq); + if (ddi_copyout(&isa_irq, (void *)arg, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + + } + break; + case VM_MAP_MEMORY: + if (ddi_copyin((void *)arg, &seg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + error = vm_malloc(sc->vm, seg.gpa, seg.len); + break; + case VM_GET_MEMORY_SEG: + if (ddi_copyin((void *)arg, &seg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + seg.len = 0; + (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg); + if (ddi_copyout(&seg, (void *)arg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + error = 0; + break; + case VM_GET_REGISTER: + if (ddi_copyin((void *)arg, &vmreg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum, + &vmreg.regval); + if (!error) { + if (ddi_copyout(&vmreg, (void *)arg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + } + break; + case VM_SET_REGISTER: + if (ddi_copyin((void *)arg, &vmreg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum, + vmreg.regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + if (ddi_copyin((void *)arg, &vmsegdesc, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid, + vmsegdesc.regnum, + &vmsegdesc.desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + if (ddi_copyin((void *)arg, &vmsegdesc, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid, + vmsegdesc.regnum, + &vmsegdesc.desc); + if (!error) { + if (ddi_copyout(&vmsegdesc, (void *)arg, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + } + break; + case VM_GET_CAPABILITY: + if (ddi_copyin((void *)arg, &vmcap, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + error = vm_get_capability(sc->vm, vmcap.cpuid, + vmcap.captype, + &vmcap.capval); + if (!error) { + if (ddi_copyout(&vmcap, (void *)arg, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + } + break; + case VM_SET_CAPABILITY: + if (ddi_copyin((void *)arg, &vmcap, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + error = vm_set_capability(sc->vm, vmcap.cpuid, + vmcap.captype, + vmcap.capval); + break; + case VM_SET_X2APIC_STATE: + if (ddi_copyin((void *)arg, &x2apic, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + error = vm_set_x2apic_state(sc->vm, + x2apic.cpuid, x2apic.state); + break; + case VM_GET_X2APIC_STATE: + if (ddi_copyin((void *)arg, &x2apic, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + error = vm_get_x2apic_state(sc->vm, + x2apic.cpuid, &x2apic.state); + if (!error) { + if (ddi_copyout(&x2apic, (void *)arg, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + } + break; + case VM_GLA2GPA: { + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + if (ddi_copyin((void *)arg, &gg, + sizeof (struct vm_gla2gpa), mode)) { + return (EFAULT); + } + error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla, + gg.prot, &gg.gpa); + KASSERT(error == 0 || error == 1 || error == -1, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + if (error >= 0) { + /* + * error = 0: the translation was successful + * error = 1: a fault was injected into the guest + */ + gg.fault = error; + error = 0; + if (ddi_copyout(&gg, (void *)arg, + sizeof (struct vm_gla2gpa), mode)) { + return (EFAULT); + } + } else { + error = EFAULT; + } + break; + } + case VM_ACTIVATE_CPU: + if (ddi_copyin((void *)arg, &vac, + sizeof (struct vm_activate_cpu), mode)) { + return (EFAULT); + } + error = vm_activate_cpu(sc->vm, vac.vcpuid); + break; + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vm, vcpu); + break; + default: + error = ENOTTY; + break; + } + + if (state_changed == 1) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } else if (state_changed == 2) { + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } + +done: + /* Make sure that no handler returns a bogus value like ERESTART */ + KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static +minor_t vmm_find_free_minor(void) +{ + minor_t minor; + + for (minor = 1; ; minor++) { + if (ddi_get_soft_state(vmm_statep, minor) == NULL) + break; + } + + return (minor); +} + +int +vmmdev_do_vm_create(dev_info_t *dip, char *name) +{ + struct vmm_softc *sc; + minor_t minor; + int error; + + mutex_enter(&vmmdev_mtx); + + if (strlen(name) >= VM_MAX_NAMELEN) { + mutex_exit(&vmmdev_mtx); + return (EINVAL); + } + + minor = vmm_find_free_minor(); + if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + ddi_soft_state_free(vmm_statep, minor); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + strcpy(sc->name, name); + sc->minor = minor; + + if (ddi_create_minor_node(dip, name, S_IFCHR, minor, + DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_free(vmm_statep, minor); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + + error = vm_create(name, &sc->vm); + if (error != 0) { + ddi_soft_state_free(vmm_statep, minor); + ddi_remove_minor_node(dip, name); + mutex_exit(&vmmdev_mtx); + return (error); + } + SLIST_INSERT_HEAD(&head, sc, link); + + mutex_exit(&vmmdev_mtx); + + return (0); +} + +static struct vmm_softc * +vmm_lookup(char *name) +{ + struct vmm_softc *sc; + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(sc->name, name) == 0) { + break; + } + } + + return (sc); + +} + +struct vm * +vm_lookup_by_name(char *name) +{ + struct vmm_softc *sc; + + mutex_enter(&vmmdev_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmmdev_mtx); + return (NULL); + } + + mutex_exit(&vmmdev_mtx); + + return (sc->vm); +} + +int +vmmdev_do_vm_destroy(dev_info_t *dip, char *name) +{ + struct vmm_softc *sc; + dev_info_t *pdip = ddi_get_parent(dip); + + mutex_enter(&vmmdev_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); + } + + if (sc->open) { + mutex_exit(&vmmdev_mtx); + return (EBUSY); + } + + vm_destroy(sc->vm); + SLIST_REMOVE(&head, sc, vmm_softc, link); + ddi_remove_minor_node(dip, name); + ddi_soft_state_free(vmm_statep, sc->minor); + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + + mutex_exit(&vmmdev_mtx); + + return (0); +} + +int +vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot) +{ + vm_paddr_t paddr; + + mutex_enter(&vmmdev_mtx); + + paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE); + if (paddr == -1) { + return (-1); + } + + mutex_exit(&vmmdev_mtx); + + return (btop(paddr)); +} + + +static int +vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + struct vmm_softc *sc; + + minor = getminor(*devp); + if (minor == VMM_CTL_MINOR) { + /* + * Master control device must be opened exclusively. + */ + if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { + return (EINVAL); + } + + return (0); + } + + mutex_enter(&vmmdev_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmmdev_mtx); + return (ENXIO); + } + + if (sc->open) { + mutex_exit(&vmmdev_mtx); + return (EBUSY); + } + sc->open = B_TRUE; + mutex_exit(&vmmdev_mtx); + + return (0); +} + +static int +vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + struct vmm_softc *sc; + + minor = getminor(dev); + if (minor == VMM_CTL_MINOR) + return (0); + + mutex_enter(&vmmdev_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmmdev_mtx); + return (ENXIO); + } + + sc->open = B_FALSE; + mutex_exit(&vmmdev_mtx); + + return (0); +} + +static int +vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + struct vmm_softc *sc; + struct vmm_ioctl kvi; + minor_t minor; + + minor = getminor(dev); + + if (minor == VMM_CTL_MINOR) { + if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl), + mode)) { + return (EFAULT); + } + switch (cmd) { + case VMM_CREATE_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name)); + case VMM_DESTROY_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name)); + default: + break; + } + } + + sc = ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); +} + +static int +vmm_mmap(dev_t dev, off_t off, int prot) +{ + struct vmm_softc *sc; + + sc = ddi_get_soft_state(vmm_statep, getminor(dev)); + ASSERT(sc); + + return (vmmdev_do_vm_mmap(sc, off, prot)); +} + +static int +vmm_segmap(dev_t dev, off_t off, struct as *as, + caddr_t *addrp, off_t len, unsigned int prot, + unsigned int maxprot, unsigned int flags, cred_t *credp) +{ + struct segdev_crargs dev_a; + int error; + + as_rangelock(as); + + error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (error != 0) { + as_rangeunlock(as); + return (error); + } + + dev_a.mapfunc = vmm_mmap; + dev_a.dev = dev; + dev_a.offset = off; + dev_a.type = (flags & MAP_TYPE); + dev_a.prot = (uchar_t)prot; + dev_a.maxprot = (uchar_t)maxprot; + dev_a.hat_attr = 0; + dev_a.hat_flags = HAT_LOAD_NOCONSIST; + dev_a.devmap_data = NULL; + + error = as_map(as, *addrp, len, segdev_create, &dev_a); + + as_rangeunlock(as); + + return (error); +} + +static int +vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + return (0); +} + +static int +vmm_probe(dev_info_t *dip) +{ + if (driver_installed(ddi_name_to_major("kvm"))) { + cmn_err(CE_WARN, "kvm is installed\n"); + return (DDI_PROBE_FAILURE); + } + + return (DDI_PROBE_SUCCESS); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + switch (cmd) { + case DDI_ATTACH: + break; + default: + return (DDI_FAILURE); + } + + if (vmm_mod_load()) { + return (DDI_FAILURE); + } + + vmm_dip = dip; + + /* + * Create control node. Other nodes will be created on demand. + */ + if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR, + VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { + return (DDI_FAILURE); + } + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + default: + return (DDI_FAILURE); + } + + if (vmm_mod_unload()) {; + return (DDI_FAILURE); + } + + /* + * Remove the control node. + */ + ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE); + vmm_dip = NULL; + + return (DDI_SUCCESS); +} + +static struct cb_ops vmm_cb_ops = { + vmm_open, + vmm_close, + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + vmm_ioctl, + nodev, /* devmap */ + vmm_mmap, + vmm_segmap, + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_DEVMAP +}; + +static struct dev_ops vmm_ops = { + DEVO_REV, + 0, + ddi_no_info, + nulldev, /* identify */ + vmm_probe, + vmm_attach, + vmm_detach, + nodev, /* reset */ + &vmm_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "vmm", + &vmm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + + error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0); + if (error) { + return (error); + } + + error = mod_install(&modlinkage); + if (error) { + ddi_soft_state_fini(&vmm_statep); + } + + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) { + return (error); + } + ddi_soft_state_fini(&vmm_statep); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c new file mode 100644 index 0000000000..6588f5a46d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $ + */ +/*- + * Copyright (c) 2004 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/types.h> +#include <sys/archsystm.h> +#include <sys/cpuset.h> +#include <sys/fp.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/spl.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> +#include <machine/fpu.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <sys/vmm_impl.h> + +#include <vm/as.h> +#include <vm/seg_kmem.h> + +vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + pfn_t pfn; + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); + ASSERT(pfn != PFN_INVALID); + return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); +} + +int +cpusetobj_ffs(const cpuset_t *set) +{ +#if CPUSET_WORDS > 1 + int i, cbit; + + cbit = 0; + for (i = 0; i < CPUSET_WORDS; i++) { + if (set->cpub[i] != 0) { + cbit = ffsl(set->cpub[i]); + cbit += i * sizeof (set->cpub[0]); + break; + } + } + return (cbit); +#else + return(ffsl(*set)); +#endif +} + +void +smp_rendezvous(void (* setup_func)(void *), + void (* action_func)(void *), + void (* teardown_func)(void *), + void *arg) +{ + cpuset_t cpuset; + + ASSERT(setup_func == NULL); + ASSERT(teardown_func == NULL); + + CPUSET_ALL(cpuset); + xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func); +} + +struct kmem_item { + void *addr; + size_t size; + LIST_ENTRY(kmem_item) next; +}; +static kmutex_t kmem_items_lock; +static LIST_HEAD(, kmem_item) kmem_items; + +void * +malloc(unsigned long size, struct malloc_type *mtp, int flags) +{ + void *p; + struct kmem_item *i; + int kmem_flag = KM_SLEEP; + + if (flags & M_NOWAIT) + kmem_flag = KM_NOSLEEP; + + if (flags & M_ZERO) { + p = kmem_zalloc(size + sizeof(struct kmem_item), kmem_flag); + } else { + p = kmem_alloc(size + sizeof(struct kmem_item), kmem_flag); + } + + mutex_enter(&kmem_items_lock); + i = p + size; + i->addr = p; + i->size = size; + + LIST_INSERT_HEAD(&kmem_items, i, next); + mutex_exit(&kmem_items_lock); + + return (p); +} + +void +free(void *addr, struct malloc_type *mtp) +{ + struct kmem_item *i; + + mutex_enter(&kmem_items_lock); + LIST_FOREACH(i, &kmem_items, next) { + if (i->addr == addr) + break; + } + ASSERT(i != NULL); + LIST_REMOVE(i, next); + mutex_exit(&kmem_items_lock); + + kmem_free(addr, i->size + sizeof(struct kmem_item)); +} + +void +mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts) +{ + if (opts & MTX_SPIN) { + mutex_init(&mtx->m, name, MUTEX_SPIN, + (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL)); + } else { + mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL); + } +} + +void +mtx_destroy(struct mtx *mtx) +{ + mutex_destroy(&mtx->m); +} + +void +critical_enter(void) +{ + kpreempt_disable(); + thread_affinity_set(curthread, CPU_CURRENT); +} + +void +critical_exit(void) +{ + thread_affinity_clear(curthread); + kpreempt_enable(); +} + +struct unr { + u_int item; + struct unr *link; +}; + +#define UNR_HASHSIZE 8 + +struct unrhdr { + struct mtx *mtx; + struct unr *hash[UNR_HASHSIZE]; + u_int min; + u_int max; + u_int next; +}; + +#define HASH_UNR(uh, i) ((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)]) + +static struct mtx unr_mtx; + +/* + * Allocate a new unrheader set. + * + * Highest and lowest valid values given as parameters. + */ +struct unrhdr * +new_unrhdr(int low, int high, struct mtx *mtx) +{ + struct unrhdr *uh; + + uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP); + if (mtx) { + uh->mtx = mtx; + } else { + uh->mtx = &unr_mtx; + } + uh->min = low; + uh->max = high; + uh->next = uh->min; + + return (uh); +} + +void +delete_unrhdr(struct unrhdr *uh) +{ + kmem_free(uh, sizeof (struct unrhdr)); +} + +static struct unr * +unr_lookup(struct unrhdr *uh, int item) +{ + struct unr *unr; + + ASSERT(MUTEX_HELD(&uh->mtx->m)); + + for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) { + if (unr->item == item) + break; + } + + return (unr); +} + +int +alloc_unr(struct unrhdr *uh) +{ + struct unr *unr; + int item, start; + + mutex_enter(&uh->mtx->m); + start = uh->next; + for (;;) { + item = uh->next; + if (++uh->next == uh->max) { + uh->next = uh->min; + } + + if (unr_lookup(uh, item) == NULL) { + unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP); + unr->item = item; + unr->link = HASH_UNR(uh, item); + HASH_UNR(uh, item) = unr; + break; + } + + if (item == start) { + item = -1; + break; + } + } + mutex_exit(&uh->mtx->m); + + return (item); +} + +void +free_unr(struct unrhdr *uh, u_int item) +{ + struct unr *unr, **unrp; + + mutex_enter(&uh->mtx->m); + unrp = &HASH_UNR(uh, item); + for (;;) { + ASSERT(*unrp != NULL); + if ((*unrp)->item == item) + break; + unrp = &(*unrp)->link; + } + unr = *unrp; + *unrp = unr->link; + mutex_exit(&uh->mtx->m); + kmem_free(unr, sizeof(struct unr)); +} + + +static void +vmm_glue_callout_handler(void *arg) +{ + struct callout *c = arg; + + c->c_flags &= ~CALLOUT_PENDING; + if (c->c_flags & CALLOUT_ACTIVE) { + (c->c_func)(c->c_arg); + } +} + +void +vmm_glue_callout_init(struct callout *c, int mpsafe) +{ + cyc_handler_t hdlr; + cyc_time_t when; + + hdlr.cyh_level = CY_LOW_LEVEL; + hdlr.cyh_func = vmm_glue_callout_handler; + hdlr.cyh_arg = c; + when.cyt_when = CY_INFINITY; + when.cyt_interval = CY_INFINITY; + + mutex_enter(&cpu_lock); + c->c_cyc_id = cyclic_add(&hdlr, &when); + c->c_flags |= CALLOUT_ACTIVE; + mutex_exit(&cpu_lock); +} + +int +vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, + void (*func)(void *), void *arg, int flags) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_func = func; + c->c_arg = arg; + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + + if (flags & C_ABSOLUTE) + cyclic_reprogram(c->c_cyc_id, sbt); + else + cyclic_reprogram(c->c_cyc_id, sbt + gethrtime()); + + return (0); +} + +int +vmm_glue_callout_stop(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + cyclic_reprogram(c->c_cyc_id, CY_INFINITY); + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + + return (0); +} + +int +vmm_glue_callout_drain(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + mutex_enter(&cpu_lock); + cyclic_remove(c->c_cyc_id); + c->c_cyc_id = CYCLIC_NONE; + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + mutex_exit(&cpu_lock); + + return (0); +} + +static int +ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) +{ + return (0); +} + +void +ipi_cpu(int cpu, u_int ipi) +{ + cpuset_t set; + + CPUSET_ONLY(set, cpu); + xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set), + ipi_cpu_justreturn); +} + +#define SC_TABLESIZE 256 /* Must be power of 2. */ +#define SC_MASK (SC_TABLESIZE - 1) +#define SC_SHIFT 8 +#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ + SC_MASK) +#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] + +struct sleepqueue { + u_int sq_blockedcnt; /* Num. of blocked threads. */ + LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */ + void *sq_wchan; /* Wait channel. */ + kcondvar_t sq_cv; +}; + +struct sleepqueue_chain { + LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ + struct mtx sc_lock; /* Spin lock for this chain. */ +}; + +static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; + +#define SLEEPQ_CACHE_SZ (64) +static kmem_cache_t *vmm_sleepq_cache; + +static int +vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags) +{ + struct sleepqueue *sq = (struct sleepqueue *)buf; + + bzero(sq, sizeof (struct sleepqueue)); + cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL); + + return (0); +} + +static void +vmm_sleepq_cache_fini(void *buf, void *user_arg) +{ + struct sleepqueue *sq = (struct sleepqueue *)buf; + cv_destroy(&sq->sq_cv); +} + +static void +init_sleepqueues(void) +{ + int i; + + for (i = 0; i < SC_TABLESIZE; i++) { + LIST_INIT(&sleepq_chains[i].sc_queues); + mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, + MTX_SPIN); + } + + vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache", + sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init, + vmm_sleepq_cache_fini, NULL, NULL, NULL, 0); + +} + +/* + * Lock the sleep queue chain associated with the specified wait channel. + */ +static void +sleepq_lock(void *wchan) +{ + struct sleepqueue_chain *sc; + + sc = SC_LOOKUP(wchan); + mtx_lock_spin(&sc->sc_lock); +} + +/* + * Look up the sleep queue associated with a given wait channel in the hash + * table locking the associated sleep queue chain. If no queue is found in + * the table, NULL is returned. + */ +static struct sleepqueue * +sleepq_lookup(void *wchan) +{ + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + + KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); + sc = SC_LOOKUP(wchan); + mtx_assert(&sc->sc_lock, MA_OWNED); + LIST_FOREACH(sq, &sc->sc_queues, sq_hash) + if (sq->sq_wchan == wchan) + return (sq); + return (NULL); +} + +/* + * Unlock the sleep queue chain associated with a given wait channel. + */ +static void +sleepq_release(void *wchan) +{ + struct sleepqueue_chain *sc; + + sc = SC_LOOKUP(wchan); + mtx_unlock_spin(&sc->sc_lock); +} + +struct sleepqueue * +sleepq_add(void *wchan) +{ + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + + sc = SC_LOOKUP(wchan); + + /* Look up the sleep queue associated with the wait channel 'wchan'. */ + sq = sleepq_lookup(wchan); + + if (sq == NULL) { + sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP); + LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); + sq->sq_wchan = wchan; + } + + sq->sq_blockedcnt++; + + return (sq); +} + +void +sleepq_remove(struct sleepqueue *sq) +{ + sq->sq_blockedcnt--; + + if (sq->sq_blockedcnt == 0) { + LIST_REMOVE(sq, sq_hash); + kmem_cache_free(vmm_sleepq_cache, sq); + } +} + +int +msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks) +{ + struct sleepqueue *sq; + int error; + + sleepq_lock(chan); + sq = sleepq_add(chan); + sleepq_release(chan); + + cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK); + + sleepq_lock(chan); + sleepq_remove(sq); + sleepq_release(chan); + + return (error); +} + +void +wakeup(void *chan) +{ + struct sleepqueue *sq; + + sleepq_lock(chan); + sq = sleepq_lookup(chan); + if (sq != NULL) { + cv_broadcast(&sq->sq_cv); + } + sleepq_release(chan); +} + +void +wakeup_one(void *chan) +{ + struct sleepqueue *sq; + + sleepq_lock(chan); + sq = sleepq_lookup(chan); + if (sq != NULL) { + cv_signal(&sq->sq_cv); + } + sleepq_release(chan); +} + +u_int cpu_high; /* Highest arg to CPUID */ +u_int cpu_exthigh; /* Highest arg to extended CPUID */ +u_int cpu_id; /* Stepping ID */ +char cpu_vendor[20]; /* CPU Origin code */ + +static void +vmm_cpuid_init(void) +{ + u_int regs[4]; + + do_cpuid(0, regs); + cpu_high = regs[0]; + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + do_cpuid(1, regs); + cpu_id = regs[0]; + + do_cpuid(0x80000000, regs); + cpu_exthigh = regs[0]; +} + +struct savefpu { + fpu_ctx_t fsa_fp_ctx; +}; + +static vmem_t *fpu_save_area_arena; + +static void +fpu_save_area_init(void) +{ + fpu_save_area_arena = vmem_create("fpu_save_area", + NULL, 0, XSAVE_AREA_ALIGN, + segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP); +} + +static void +fpu_save_area_cleanup(void) +{ + vmem_destroy(fpu_save_area_arena); +} + +struct savefpu * +fpu_save_area_alloc(void) +{ + return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu), + VM_SLEEP)); +} + +void +fpu_save_area_free(struct savefpu *fsa) +{ + vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu)); +} + +void +fpu_save_area_reset(struct savefpu *fsa) +{ + extern const struct fxsave_state sse_initial; + extern const struct xsave_state avx_initial; + struct fpu_ctx *fp; + struct fxsave_state *fx; + struct xsave_state *xs; + + fp = &fsa->fsa_fp_ctx; + + fp->fpu_regs.kfpu_status = 0; + fp->fpu_regs.kfpu_xstatus = 0; + + switch (fp_save_mech) { + case FP_FXSAVE: + fx = &fp->fpu_regs.kfpu_u.kfpu_fx; + bcopy(&sse_initial, fx, sizeof (*fx)); + break; + case FP_XSAVE: + fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 | + XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX); + xs = &fp->fpu_regs.kfpu_u.kfpu_xs; + bcopy(&avx_initial, xs, sizeof (*xs)); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +void +fpuexit(kthread_t *td) +{ + fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu); +} + +static __inline void +vmm_fxrstor(struct fxsave_state *addr) +{ + __asm __volatile("fxrstor %0" : : "m" (*(addr))); +} + +static __inline void +vmm_fxsave(struct fxsave_state *addr) +{ + __asm __volatile("fxsave %0" : "=m" (*(addr))); +} + +static __inline void +vmm_xrstor(struct xsave_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); +} + +static __inline void +vmm_xsave(struct xsave_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : + "memory"); +} + +void +fpurestore(void *arg) +{ + struct savefpu *fsa = (struct savefpu *)arg; + struct fpu_ctx *fp; + + fp = &fsa->fsa_fp_ctx; + + switch (fp_save_mech) { + case FP_FXSAVE: + vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +void +fpusave(void *arg) +{ + struct savefpu *fsa = (struct savefpu *)arg; + struct fpu_ctx *fp; + + fp = &fsa->fsa_fp_ctx; + + switch (fp_save_mech) { + case FP_FXSAVE: + vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +void +vmm_sol_glue_init(void) +{ + vmm_cpuid_init(); + fpu_save_area_init(); + init_sleepqueues(); +} + +void +vmm_sol_glue_cleanup(void) +{ + fpu_save_area_cleanup(); + kmem_cache_destroy(vmm_sleepq_cache); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c new file mode 100644 index 0000000000..3bb5412d16 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> + +#include <vm/vm.h> +#include <machine/pmap.h> + +#include <sys/ddi.h> + +#include "vmm_util.h" +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + clock_t usec = 2 * 1000000; + vm_paddr_t pa; + caddr_t addr; + + if (size != PAGE_SIZE) + panic("vmm_mem_alloc: invalid allocation size %lu", size); + + while (usec > 0) { + if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) { + ASSERT(((uintptr_t)addr & PAGE_MASK) == 0); + pa = vtophys((vm_offset_t)addr); + return (pa); + } + delay(drv_usectohz((clock_t)500000)); + usec -= 500000; + } + + return (NULL); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + page_t *pp; + + if (base & PAGE_MASK) { + panic("vmm_mem_free: base 0x%0lx must be aligned on a " + "0x%0x boundary\n", base, PAGE_SIZE); + } + + if (length != PAGE_SIZE) { + panic("vmm_mem_free: invalid length %lu", length); + } + + pp = page_numtopp_nolock(btop(base)); + kmem_free((void *)pp->p_offset, PAGE_SIZE); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptob(physmax + 1)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h new file mode 100644 index 0000000000..9bf7a60e0b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + enum vmm_stat_scope scope; +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c new file mode 100644 index 0000000000..fabd42e13c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#ifdef __FreeBSD__ +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h new file mode 100644 index 0000000000..fe1c1c9449 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s new file mode 100644 index 0000000000..d84ca30275 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmx_assym.s @@ -0,0 +1 @@ +#include "vmx_assym.h" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c new file mode 100644 index 0000000000..02222ef5e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -0,0 +1,276 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/cpuset.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> + +#include <machine/vmm.h> + +#include "x86.h" + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +static uint64_t bhyve_xcpuids; + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + int error; + unsigned int func, regs[4]; + enum x2apic_state x2apic_state; + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } + + func = *eax; + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0008: + cpuid_count(*eax, *ecx, regs); + break; + + case CPUID_8000_0001: + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + cpuid_count(*eax, *ecx, regs); + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + cpuid_count(*eax, *ecx, regs); +#ifdef __FreeBSD__ + /* + * If the host TSCs are not synchronized across + * physical cpus then we cannot advertise an + * invariant tsc to a vcpu. + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ + if (!smp_tsc) + regs[3] &= ~AMDPM_TSC_INVARIANT; +#endif + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep or TME capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + + /* + * Hide xsave/osxsave/avx until the FPU save/restore + * issues are resolved + */ + regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | + CPUID2_AVX); + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Machine check handling is done in the host. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Disable multi-core. + */ + regs[1] &= ~CPUID_HTT_CORES; + regs[3] &= ~CPUID_HTT; + break; + + case CPUID_0000_0004: + do_cpuid(4, regs); + + /* + * Do not expose topology. + */ + regs[0] &= 0xffff8000; + /* + * The maximum number of processor cores in + * this physical processor package and the + * maximum number of threads sharing this + * cache are encoded with "plus 1" encoding. + * Adding one to the value in this register + * field to obtains the actual value. + * + * Therefore 0 for both indicates 1 core + * per package and no cache sharing. + */ + break; + + case CPUID_0000_0006: + case CPUID_0000_0007: + case CPUID_0000_000A: + case CPUID_0000_000D: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = vcpu_id; + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + default: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(*eax, *ecx, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + + return (1); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h new file mode 100644 index 0000000000..db2340b37b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); + +#endif diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 7fa9f8f6ef..80461a55c1 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ -21,6 +21,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2017 Joyent, Inc. # # uts/i86pc/sys/Makefile # @@ -36,7 +37,7 @@ include ../Makefile.i86pc # FILEMODE = 644 -HDRS= \ +CHKHDRS= \ acpidev.h \ amd_iommu.h \ asm_misc.h \ @@ -66,6 +67,15 @@ HDRS= \ xc_levels.h \ xsvc.h +NOCHKHDRS= \ + vmm.h \ + vmm_impl.h \ + vmm_instruction_emul.h + +HDRS= \ + $(CHKHDRS) \ + $(NOCHKHDRS) + ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%) ROOTDIR= $(ROOT)/usr/share/src @@ -74,7 +84,7 @@ ROOTDIRS= $(ROOTDIR)/uts $(ROOTDIR)/uts/$(PLATFORM) ROOTLINK= $(ROOTDIR)/uts/$(PLATFORM)/sys LINKDEST= ../../../../platform/$(PLATFORM)/include/sys -CHECKHDRS= $(HDRS:%.h=%.check) +CHECKHDRS= $(CHKHDRS:%.h=%.check) .KEEP_STATE: diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h new file mode 100644 index 0000000000..a4fb0f2527 --- /dev/null +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VIONA_IO_H_ +#define _VIONA_IO_H_ + +#define VNA_IOC (('V' << 16)|('C' << 8)) +#define VNA_IOC_CREATE (VNA_IOC | 1) +#define VNA_IOC_DELETE (VNA_IOC | 2) +#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3) +#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4) +#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5) +#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6) +#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7) +#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) +#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) +#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) + +typedef struct vioc_create { + datalink_id_t c_linkid; + char c_vmname[64]; + size_t c_lomem_size; + size_t c_himem_size; +} vioc_create_t; + +typedef struct vioc_ring_init { + uint16_t ri_qsize; + uint64_t ri_qaddr; +} vioc_ring_init_t; + +#endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h new file mode 100644 index 0000000000..e876ce748f --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -0,0 +1,565 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include <x86/segments.h> + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_MAX_NAMELEN 32 + +#ifdef _KERNEL + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vm_guest_paging; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, + vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, + boolean_t superpages_ok); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_set_func_t vmmmap_set; + vmi_mmap_get_func_t vmmmap_get; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifdef __FreeBSD__ +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +#endif +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifndef __FreeBSD__ +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +#endif +void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, + void **cookie); +void vm_gpa_release(void *cookie); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +cpuset_t vm_active_cpus(struct vm *vm); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); + +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. + */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +void *vcpu_stats(struct vm *vm, int vcpu); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); + +/* + * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); +#endif /* KERNEL */ + +#define VM_MAXCPU 16 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_ENABLE_INVPCID, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + } u; +}; + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#ifndef __FreeBSD__ +#ifdef _KERNEL +extern void vmm_sol_glue_init(void); +extern void vmm_sol_glue_cleanup(void); + +extern int vmm_mod_load(void); +extern int vmm_mod_unload(void); +#endif +#endif + +#endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h new file mode 100644 index 0000000000..3e74eb8786 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -0,0 +1,334 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t gpa; /* in */ + size_t len; + int wired; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_run { + int cpuid; + struct vm_exit vm_exit; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + uint32_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint32_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, + IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, + IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + + /* interrupt injection */ + IOCNUM_INJECT_EXCEPTION = 30, + IOCNUM_LAPIC_IRQ = 31, + IOCNUM_INJECT_NMI = 32, + IOCNUM_IOAPIC_ASSERT_IRQ = 33, + IOCNUM_IOAPIC_DEASSERT_IRQ = 34, + IOCNUM_IOAPIC_PULSE_IRQ = 35, + IOCNUM_LAPIC_MSI = 36, + IOCNUM_LAPIC_LOCAL_IRQ = 37, + IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, + + /* PCI pass-thru */ + IOCNUM_BIND_PPTDEV = 40, + IOCNUM_UNBIND_PPTDEV = 41, + IOCNUM_MAP_PPTDEV_MMIO = 42, + IOCNUM_PPTDEV_MSI = 43, + IOCNUM_PPTDEV_MSIX = 44, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* kernel device state */ + IOCNUM_SET_X2APIC_STATE = 60, + IOCNUM_GET_X2APIC_STATE = 61, + IOCNUM_GET_HPET_CAPABILITIES = 62, + + /* legacy interrupt injection */ + IOCNUM_ISA_ASSERT_IRQ = 80, + IOCNUM_ISA_DEASSERT_IRQ = 81, + IOCNUM_ISA_PULSE_IRQ = 82, + IOCNUM_ISA_SET_IRQ_TRIGGER = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_LOCAL_IRQ \ + _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_MSI \ + _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) +#define VM_IOAPIC_ASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_DEASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PULSE_IRQ \ + _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PINCOUNT \ + _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) +#define VM_ISA_ASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_DEASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_PULSE_IRQ \ + _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) +#define VM_ISA_SET_IRQ_TRIGGER \ + _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#ifdef __FreeBSD__ +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#endif +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_HPET_CAPABILITIES \ + _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#endif diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h new file mode 100644 index 0000000000..1602fa286d --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_IMPL_H_ +#define _VMM_IMPL_H_ + +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/varargs.h> + +/* + * /dev names: + * /dev/vmmctl - control device + * /dev/vmm/<name> - vm devices + */ +#define VMM_DRIVER_NAME "vmm" + +#define VMM_CTL_MINOR_NODE "ctl" +#define VMM_CTL_MINOR_NAME VMM_DRIVER_NAME VMM_CTL_NODE +#define VMM_CTL_MINOR 0 + +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) + +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) + +struct vmm_ioctl { + char vmm_name[VM_MAX_NAMELEN]; +}; + +#ifdef _KERNEL +struct vmm_softc { + boolean_t open; + minor_t minor; + struct vm *vm; + char name[VM_MAX_NAMELEN]; + SLIST_ENTRY(vmm_softc) link; +}; +#endif + +/* + * VMM trace ring buffer constants + */ +#define VMM_DMSG_RING_SIZE 0x100000 /* 1MB */ +#define VMM_DMSG_BUF_SIZE 256 + +/* + * VMM trace ring buffer content + */ +typedef struct vmm_trace_dmsg { + timespec_t timestamp; + char buf[VMM_DMSG_BUF_SIZE]; + struct vmm_trace_dmsg *next; +} vmm_trace_dmsg_t; + +/* + * VMM trace ring buffer header + */ +typedef struct vmm_trace_rbuf { + kmutex_t lock; /* lock to avoid clutter */ + int looped; /* completed ring */ + int allocfailed; /* dmsg mem alloc failed */ + size_t size; /* current size */ + size_t maxsize; /* max size */ + vmm_trace_dmsg_t *dmsgh; /* messages head */ + vmm_trace_dmsg_t *dmsgp; /* ptr to last message */ +} vmm_trace_rbuf_t; + +/* + * VMM trace ring buffer interfaces + */ +void vmm_trace_log(const char *fmt, ...); + +#endif /* _VMM_IMPL_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h new file mode 100644 index 0000000000..8138890a2c --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +#include <sys/mman.h> + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * s + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* + * Returns 1 if an alignment check exception should be injected and 0 otherwise. + */ +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * Returns 0 on success and '*gpa' contains the result of the translation. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa); + +void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); + +/* + * Decode the instruction fetched into 'vie' so it can be emulated. + * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. + * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. + */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile new file mode 100644 index 0000000000..c2b8bd8dcf --- /dev/null +++ b/usr/src/uts/i86pc/viona/Makefile @@ -0,0 +1,72 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = viona +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile new file mode 100644 index 0000000000..b3ab735781 --- /dev/null +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -0,0 +1,94 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vmm +OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides and additions +# + +# These sources only compile with gcc. Workaround a confluence of cruft +# regarding dmake and shadow compilation by neutering the sun compiler. +amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc +CFLAGS += -_cc=-xdryrun + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) +PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) + +CFLAGS += -_gcc=-Wimplicit-function-declaration + +OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in +ASSYM_H = $(OBJS_DIR)/vmx_assym.h + +CLEANFILES += $(ASSYM_H) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ + +$(OBJECTS): $(ASSYM_H) + +$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@ |