Diffstat (limited to 'usr/src/uts/intel')
113 files changed, 47643 insertions, 2 deletions
diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index 7f464e2ec8..fe380ad770 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -24,6 +24,7 @@ # Copyright 2019 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright 2019 Peter Tribble. +# Copyright 2022 Oxide Computer Company # # @@ -100,6 +101,15 @@ CORE_OBJS += \ prmachdep.o # +# shared hypervisor functionality +# +CORE_OBJS += \ + hma.o \ + hma_asm.o \ + hma_fpu.o \ + smt.o \ + +# # ZFS file system module # ZFS_OBJS += \ @@ -358,3 +368,18 @@ ZEN_UDF_OBJS = zen_udf.o # IMC_OBJS = imc.o imc_decode.o imc_dump.o IMCSTUB_OBJS = imcstub.o + +# +# viona (VirtIO-Net Accelerated) +# +VIONA_OBJS = \ + viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o + +# +# bhyve PCI-passthru +# +PPT_OBJS = ppt.o diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index 290eae88ff..6a273e0a27 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -750,3 +750,13 @@ DRV_KMODS += usmn zen_udf # Intel Integrated Memory Controller # DRV_KMODS += imc imcstub + +# +# VirtIO-Net Accellerated driver (for bhyve) +# +DRV_KMODS += viona + +# +# bhyve PCI-passthru +# +DRV_KMODS += ppt diff --git a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules index 84ecfad278..1ba8885f73 100644 --- a/usr/src/uts/intel/Makefile.rules +++ b/usr/src/uts/intel/Makefile.rules @@ -275,6 +275,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/zfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/viona/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # krtld compiled into unix # diff --git a/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE b/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE new file mode 100644 index 0000000000..66b39dc950 --- /dev/null +++ b/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE @@ -0,0 +1,26 @@ + +SPDX-License-Identifier: BSD-2-Clause-FreeBSD + +Copyright (c) 1992-2020 The FreeBSD Project. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. 
+ diff --git a/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE.descrip b/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..77026fc8a3 --- /dev/null +++ b/usr/src/uts/intel/io/viona/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +Bhyve hypervisor diff --git a/usr/src/uts/intel/io/viona/viona.conf b/usr/src/uts/intel/io/viona/viona.conf new file mode 100644 index 0000000000..e66488531a --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2013 Pluribus Networks Inc. +# + +name="viona" parent="pseudo"; diff --git a/usr/src/uts/intel/io/viona/viona.mapfile b/usr/src/uts/intel/io/viona/viona.mapfile new file mode 100644 index 0000000000..cece86348c --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + local: + *; +}; diff --git a/usr/src/uts/intel/io/viona/viona_hook.c b/usr/src/uts/intel/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. 
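A minimal caller-side sketch of this contract, using a hypothetical transmit helper: if the hook rejects the packet, only the (possibly altered or freed) pointer left behind in *mpp may be consulted afterwards.

/*
 * Hypothetical caller sketch: run the outbound hook before handing a
 * chain to MAC, honoring the contract described above.
 */
static void
example_tx_with_hook(viona_link_t *link, viona_vring_t *ring, mblk_t *mp)
{
	if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
		/* Rejected: the hook may have altered or freed the chain */
		if (mp != NULL)
			freemsgchain(mp);
		return;
	}
	/* Accepted: mp may have been altered but remains ours to send */
	(void) mac_tx(link->l_mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
}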
+ */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? vnh->vnh_token_out : vnh->vnh_token_in; + + if (!he.he_interested) + return (0); + + info.hpe_protocol = vnh->vnh_neti; + info.hpe_ifp = (phy_if_t)link; + info.hpe_ofp = (phy_if_t)link; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); + if (ret == 0) + return (0); + + if (out) { + VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, tx_hookdrop); + } else { + VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, rx_hookdrop); + } + return (ret); +} + +/* + * netinfo stubs - required by the nethook framework, but otherwise unused + * + * Currently, all ipf rules are applied against all interfaces in a given + * netstack (e.g. all interfaces in a zone). In the future if we want to + * support being able to apply different rules to different interfaces, I + * believe we would need to implement some of these stubs to map an interface + * name in a rule (e.g. 'net0', back to an index or viona_link_t); + */ +static int +viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, + char *buf __unused, const size_t len __unused) +{ + return (-1); +} + +static int +viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_getptmue(net_handle_t neti __unused) +{ + return (-1); +} + +static int +viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, size_t nelem __unused, + net_ifaddr_t type[] __unused, void *storage __unused) +{ + return (-1); +} + +static int +viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, zoneid_t *zid __unused) +{ + return (-1); +} + +static int +viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, uint64_t *flags __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) +{ + return ((phy_if_t)-1); +} + +static phy_if_t +viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) +{ + return ((phy_if_t)-1); +} + +static lif_if_t +viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, + net_inject_t *packet __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, + struct sockaddr *next __unused) +{ + return ((phy_if_t)-1); +} + +static int +viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static int +viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static net_protocol_t viona_netinfo = { + NETINFO_VERSION, + NHF_VIONA, + viona_neti_getifname, + viona_neti_getmtu, + viona_neti_getptmue, + viona_neti_getlifaddr, + viona_neti_getlifzone, + viona_neti_getlifflags, + viona_neti_phygetnext, + viona_neti_phylookup, + viona_neti_lifgetnext, + viona_neti_inject, + 
viona_neti_route, + viona_neti_ispchksum, + viona_neti_isvchksum +}; + +/* + * Create/register our nethooks + */ +static int +viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, + net_protocol_t *netip) +{ + int ret; + + if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) { + cmn_err(CE_NOTE, "%s: net_protocol_register failed " + "(netid=%d name=%s)", __func__, nid, nh_name); + goto fail_init_proto; + } + + HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); + if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { + cmn_err(CE_NOTE, "%s: net_family_register failed " + "(netid=%d name=%s err=%d)", __func__, + nid, nh_name, ret); + goto fail_init_family; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); + if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_in)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, + nh_name); + goto fail_init_event_in; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); + if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_out)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, + nh_name); + goto fail_init_event_out; + } + return (0); + + /* + * On failure, we undo all the steps that succeeded in the + * reverse order of initialization, starting at the last + * successful step (the labels denoting the failing step). + */ +fail_init_event_out: + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + vnh->vnh_token_in = NULL; + +fail_init_event_in: + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + +fail_init_family: + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; + +fail_init_proto: + return (1); +} + +/* + * Shutdown the nethooks for a protocol family. This triggers notification + * callbacks to anything that has registered interest to allow hook consumers + * to unhook prior to the removal of the hooks as well as makes them unavailable + * to any future consumers as the first step of removal. + */ +static void +viona_nethook_shutdown(viona_nethook_t *vnh) +{ + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); +} + +/* + * Remove the nethooks for a protocol family. + */ +static void +viona_nethook_fini(viona_nethook_t *vnh) +{ + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; +} + +/* + * Callback invoked by the neti module. This creates/registers our hooks + * {IPv4,IPv6}{in,out} with the nethook framework so they are available to + * interested consumers (e.g. ipf). + * + * During attach, viona_neti_create is called once for every netstack + * present on the system at the time of attach. Thereafter, it is called + * during the creation of additional netstack instances (i.e. zone boot). As a + * result, the viona_neti_t that is created during this call always occurs + * prior to any viona instances that will use it to send hook events. 
+ * + * It should never return NULL. If we cannot register our hooks, we do not + * set vnh_hooked of the respective protocol family, which will prevent the + * creation of any viona instances on this netstack (see viona_ioc_create). + * This can only occur if after a shutdown event (which means destruction is + * imminent) we are trying to create a new instance. + */ +static void * +viona_neti_create(const netid_t netid) +{ + viona_neti_t *nip; + + VERIFY(netid != -1); + + nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); + nip->vni_netid = netid; + nip->vni_zid = net_getzoneidbynetid(netid); + mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), + offsetof(viona_soft_state_t, ss_node)); + + if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, + &viona_netinfo) == 0) + nip->vni_nethook.vnh_hooked = B_TRUE; + + mutex_enter(&viona_neti_lock); + list_insert_tail(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + return (nip); +} + +/* + * Called during netstack teardown by the neti module. During teardown, all + * the shutdown callbacks are invoked, allowing consumers to release any holds + * and otherwise quiesce themselves prior to destruction, followed by the + * actual destruction callbacks. + */ +static void +viona_neti_shutdown(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&viona_neti_lock); + list_remove(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_shutdown(&nip->vni_nethook); +} + +/* + * Called during netstack teardown by the neti module. Destroys the viona + * netinst data. This is invoked after all the netstack and neti shutdown + * callbacks have been invoked. + */ +static void +viona_neti_destroy(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&nip->vni_lock); + while (nip->vni_ref != 0) + cv_wait(&nip->vni_ref_change, &nip->vni_lock); + mutex_exit(&nip->vni_lock); + + VERIFY(!list_link_active(&nip->vni_node)); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_fini(&nip->vni_nethook); + + mutex_destroy(&nip->vni_lock); + list_destroy(&nip->vni_dev_list); + kmem_free(nip, sizeof (*nip)); +} + +/* + * Find the viona netinst data by zone id. This is only used during + * viona instance creation (and thus is only called by a zone that is running). 
+ */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/intel/io/viona/viona_impl.h b/usr/src/uts/intel/io/viona/viona_impl.h new file mode 100644 index 0000000000..760474e78b --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_impl.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include <sys/ddi.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/uio.h> + +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/neti.h> +#include <inet/ip.h> +#include <inet/tcp.h> + +#include <sys/vmm_drv.h> +#include <sys/viona_io.h> + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ + VRS_STOP = 0x4, /* worker is exiting */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + uint16_t vr_cur_uidx; /* drives 'used_idx' */ + + /* Reference to guest pages holding virtqueue */ + void **vr_map_pages; + vmm_page_t *vr_map_hold; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uint16_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + net_handle_t vnh_neti; + hook_family_t vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; 
+} viona_nethook_t; + +struct viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void viona_ring_free(viona_vring_t *); +int viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); + +int 
vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *, + vmm_page_t **); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); + +void viona_intr_ring(viona_vring_t *ring, boolean_t); +void viona_ring_set_no_notify(viona_vring_t *, boolean_t); +void viona_ring_disable_notify(viona_vring_t *); +void viona_ring_enable_notify(viona_vring_t *); +uint16_t viona_ring_num_avail(viona_vring_t *); + + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/intel/io/viona/viona_main.c b/usr/src/uts/intel/io/viona/viona_main.c new file mode 100644 index 0000000000..a34196ba1a --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_main.c @@ -0,0 +1,1006 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +/* + * viona - VirtIO-Net, Accelerated + * + * The purpose of viona is to provide high performance virtio-net devices to + * bhyve guests. It does so by sitting directly atop MAC, skipping all of the + * DLS/DLD stack. 
+ * + * -------------------- + * General Architecture + * -------------------- + * + * A single viona instance is comprised of a "link" handle and two "rings". + * After opening the viona device, it must be associated with a MAC network + * interface and a bhyve (vmm) instance to form its link resource. This is + * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are + * passed in to perform the initialization. With the MAC client opened, and a + * driver handle to the vmm instance established, the device is ready to be + * configured by the guest. + * + * The userspace portion of bhyve, which interfaces with the PCI device + * emulation framework, is meant to stay out of the datapath if at all + * possible. Configuration changes made via PCI are mapped to actions which + * will steer the operation of the in-kernel logic. + * + * + * ----------- + * Ring Basics + * ----------- + * + * Each viona link has two viona_vring_t entities, RX and TX, for handling data + * transfers to and from the guest. They represent an interface to the + * standard virtio ring structures. When intiailized and active, each ring is + * backed by a kernel worker thread (parented to the bhyve process for the + * instance) which handles ring events. The RX worker has the simple task of + * watching for ring shutdown conditions. The TX worker does that in addition + * to processing all requests to transmit data. Data destined for the guest is + * delivered directly by MAC to viona_rx() when the ring is active. + * + * + * ----------- + * Ring States + * ----------- + * + * The viona_vring_t instances follow a simple path through the possible state + * values represented in virtio_vring_t`vr_state: + * + * +<--------------------------------------------+ + * | | + * V ^ + * +-----------+ This is the initial state when a link is created or + * | VRS_RESET | when the ring has been explicitly reset. + * +-----------+ + * | ^ + * |---* ioctl(VNA_IOC_RING_INIT) issued | + * | | + * | ^ + * V + * +-----------+ The ring parameters (size, guest physical addresses) + * | VRS_SETUP | have been set and start-up of the ring worker thread + * +-----------+ has begun. + * | ^ + * | | + * |---* ring worker thread begins execution | + * | | + * +-------------------------------------------->+ + * | | ^ + * | | + * | * If ring shutdown is requested (by ioctl or impending + * | bhyve process death) while the worker thread is + * | starting, the worker will transition the ring to + * | VRS_RESET and exit. + * | ^ + * | | + * | ^ + * V + * +-----------+ The worker thread associated with the ring has started + * | VRS_INIT | executing. It has allocated any extra resources needed + * +-----------+ for the ring to operate. + * | ^ + * | | + * +-------------------------------------------->+ + * | | ^ + * | | + * | * If ring shutdown is requested while the worker is + * | waiting in VRS_INIT, it will free any extra resources + * | and transition to VRS_RESET. + * | ^ + * | | + * |--* ioctl(VNA_IOC_RING_KICK) issued | + * | ^ + * V + * +-----------+ The worker thread associated with the ring is executing + * | VRS_RUN | workload specific to that ring. + * +-----------+ + * | ^ + * |---* ioctl(VNA_IOC_RING_RESET) issued | + * | (or bhyve process begins exit) ^ + * | + * +-----------+ The worker thread associated with the ring is in the + * | VRS_STOP | process of exiting. All outstanding TX and RX + * +-----------+ requests are allowed to complete, but new requests + * | must be ignored. 
+ * | ^ + * | | + * +-------------------------------------------->+ + * + * + * While the worker thread is not running, changes to vr_state are only made by + * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts + * the worker, and sets the ring state to VRS_SETUP. Once the worker thread + * has been started, only it may perform ring state transitions (still under + * the protection of vr_lock), when requested by outside consumers via + * vr_state_flags or when the containing bhyve process initiates an exit. + * + * + * ---------------------------- + * Transmission mblk_t Handling + * ---------------------------- + * + * For incoming frames destined for a bhyve guest, the data must first land in + * a host OS buffer from the physical NIC before it is copied into the awaiting + * guest buffer(s). Outbound frames transmitted by the guest are not bound by + * this limitation and can avoid extra copying before the buffers are accessed + * directly by the NIC. When a guest designates buffers to be transmitted, + * viona translates the guest-physical addresses contained in the ring + * descriptors to host-virtual addresses via viona_hold_page(). That pointer is + * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc(). + * Doing so increments vr_xfer_outstanding, preventing the ring from being + * reset (allowing the link to drop its vmm handle to the guest) until all + * transmit mblks referencing guest memory have been processed. Allocation of + * the viona_desb_t entries is done during the VRS_INIT stage of the ring + * worker thread. The ring size informs that allocation as the number of + * concurrent transmissions is limited by the number of descriptors in the + * ring. This minimizes allocation in the transmit hot-path by acquiring those + * fixed-size resources during initialization. + * + * This optimization depends on the underlying NIC driver freeing the mblks in + * a timely manner after they have been transmitted by the hardware. Some + * drivers have been found to flush TX descriptors only when new transmissions + * are initiated. This means that there is no upper bound to the time needed + * for an mblk to be flushed and can stall bhyve guests from shutting down + * since their memory must be free of viona TX references prior to clean-up. + * + * This expectation of deterministic mblk_t processing is likely the reason + * behind the notable exception to the zero-copy TX path: systems with 'bnxe' + * loaded will copy transmit data into fresh buffers rather than passing up + * zero-copy mblks. It is a hold-over from the original viona sources provided + * by Pluribus and its continued necessity has not been confirmed. + * + * + * ---------------------------- + * Ring Notification Fast-paths + * ---------------------------- + * + * Device operation for viona requires that notifications flow to and from the + * guest to indicate certain ring conditions. In order to minimize latency and + * processing overhead, the notification procedures are kept in-kernel whenever + * possible. + * + * Guest-to-host notifications, when new available descriptors have been placed + * in the ring, are posted via the 'queue notify' address in the virtio BAR. + * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to + * install a callback hook on an ioport address. Guest exits for accesses to + * viona-hooked ioport addresses will result in direct calls to notify the + * appropriate ring worker without a trip to userland. 
+ * + * Host-to-guest notifications in the form of interrupts enjoy similar + * acceleration. Each viona ring can be configured to send MSI notifications + * to the guest as virtio conditions dictate. This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/stat.h> + +#include <sys/dlpi.h> + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. 
+ */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int 
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? + */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == -1) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, 
(uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + if (data < 0 || data > UINT16_MAX) { + err = EINVAL; + break; + } + err = viona_ioc_set_notify_ioport(link, (uint16_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. 
+ */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. This condition should be impossible to + * encounter when performing the on-close destroy of the link, + * since racing ioctl accessors must necessarily be absent. + */ + VERIFY(!on_close); + mutex_exit(&ss->ss_lock); + return (EAGAIN); + } + /* + * The link deletion cannot fail after this point, continuing until its + * successful completion is reached. + */ + link->l_destroyed = B_TRUE; + + /* + * Tear down the IO port hook so it cannot be used to kick any of the + * rings which are about to be reset and stopped. + */ + VERIFY0(viona_ioc_set_notify_ioport(link, 0)); + mutex_exit(&ss->ss_lock); + + /* + * Return the rings to their reset state, ignoring any possible + * interruptions from signals. 
+ */ + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); + + mutex_enter(&ss->ss_lock); + if (link->l_mch != NULL) { + /* Unhook the receive callbacks and close out the client */ + viona_rx_clear(link); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + if (link->l_vm_hold != NULL) { + vmm_drv_rele(link->l_vm_hold); + link->l_vm_hold = NULL; + } + + nip = link->l_neti; + link->l_neti = NULL; + + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + pollhead_clean(&link->l_pollhead); + ss->ss_link = NULL; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_remove(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + viona_neti_rele(nip); + + kmem_free(link, sizeof (viona_link_t)); + return (0); +} + +static int +viona_ioc_ring_init(viona_link_t *link, void *udata, int md) +{ + vioc_ring_init_t kri; + int err; + + if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { + return (EFAULT); + } + + err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr); + + return (err); +} + +static int +viona_ioc_ring_reset(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + return (viona_ring_reset(ring, B_TRUE)); +} + +static int +viona_ioc_ring_kick(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + int err; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + mutex_enter(&ring->vr_lock); + switch (ring->vr_state) { + case VRS_SETUP: + /* + * An early kick to a ring which is starting its worker thread + * is fine. Once that thread is active, it will process the + * start-up request immediately. + */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = *val; + + if (in) { + /* + * Do not service read (in/ins) requests on this ioport. + * Instead, indicate that the handler is not found, causing a + * fallback to userspace processing. 
+ */ + return (ESRCH); + } + + if (port != link->l_notify_ioport) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, + viona_notify_iop, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/intel/io/viona/viona_ring.c b/usr/src/uts/intel/io/viona/viona_ring.c new file mode 100644 index 0000000000..2d847dda09 --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_ring.c @@ -0,0 +1,960 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + + +#include <sys/disp.h> + +#include "viona_impl.h" + +#define VRING_MAX_LEN 32768 + +/* Layout and sizing as defined in the spec for a legacy-style virtqueue */ + +#define LEGACY_VQ_ALIGN PAGESIZE + +#define LEGACY_DESC_SZ(qsz) ((qsz) * sizeof (struct virtio_desc)) +/* + * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail + * descriptors (uint16_t each), and (optional) used_event (uint16_t). + */ +#define LEGACY_AVAIL_SZ(qsz) (((qsz) + 3) * sizeof (uint16_t)) +/* + * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used + * descriptors (two uint32_t each), and (optional) avail_event (uint16_t). + */ +#define LEGACY_USED_SZ(qsz) \ + ((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t)) + +#define LEGACY_AVAIL_FLAGS_OFF(qsz) LEGACY_DESC_SZ(qsz) +#define LEGACY_AVAIL_IDX_OFF(qsz) \ + (LEGACY_DESC_SZ(qsz) + sizeof (uint16_t)) +#define LEGACY_AVAIL_ENT_OFF(qsz, idx) \ + (LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t)) + +#define LEGACY_USED_FLAGS_OFF(qsz) \ + P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN) +#define LEGACY_USED_IDX_OFF(qsz) \ + (LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t)) +#define LEGACY_USED_ENT_OFF(qsz, idx) \ + (LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \ + (idx) * sizeof (struct virtio_used)) + +#define LEGACY_VQ_SIZE(qsz) \ + (LEGACY_USED_FLAGS_OFF(qsz) + \ + P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN)) +#define LEGACY_VQ_PAGES(qsz) (LEGACY_VQ_SIZE(qsz) / PAGESIZE) + +struct vq_held_region { + struct iovec *vhr_iov; + vmm_page_t *vhr_head; + vmm_page_t *vhr_tail; + /* Length of iovec array supplied in `vhr_iov` */ + uint_t vhr_niov; + /* + * Index into vhr_iov, indicating the next "free" entry (following the + * last entry which has valid contents). + */ + uint_t vhr_idx; +}; +typedef struct vq_held_region vq_held_region_t; + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static vmm_page_t * +vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + int prot = PROT_READ; + if (writable) { + prot |= PROT_WRITE; + } + + return (vmm_drv_page_hold(ring->vr_lease, gpa, prot)); +} + +/* + * Establish a hold on the page(s) which back the region of guest memory covered + * by [gpa, gpa + len). The host-kernel-virtual pointers to those pages are + * stored in the iovec array supplied in `region`, along with the chain of + * vmm_page_t entries representing the held pages. Since guest memory + * carries no guarantees of being physically contiguous (on the host), it is + * assumed that an iovec entry will be required for each PAGESIZE section + * covered by the specified `gpa` and `len` range. For each iovec entry + * successfully populated by holding a page, `vhr_idx` will be incremented so it + * references the next available iovec entry (or `vhr_niov`, if the iovec array + * is full). The responsibility for releasing the `vmm_page_t` chain (stored in + * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result. 
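+ *
+ * A minimal usage sketch (array size and error handling are
+ * illustrative only):
+ *
+ *     struct iovec iov[4];
+ *     vq_held_region_t region = { .vhr_niov = 4, .vhr_iov = iov };
+ *     int err = vq_region_hold(ring, gpa, len, false, &region);
+ *     (on success, iov[0] .. iov[region.vhr_idx - 1] cover the data)
+ *     if (region.vhr_head != NULL)
+ *             vmm_drv_page_release_chain(region.vhr_head);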
+ */ +static int +vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len, + bool writable, vq_held_region_t *region) +{ + const uint32_t front_offset = gpa & PAGEOFFSET; + const uint32_t front_len = MIN(len, PAGESIZE - front_offset); + uint_t pages = 1; + vmm_page_t *vmp; + caddr_t buf; + + ASSERT3U(region->vhr_idx, <, region->vhr_niov); + + if (front_len < len) { + pages += P2ROUNDUP((uint64_t)(len - front_len), + PAGESIZE) / PAGESIZE; + } + if (pages > (region->vhr_niov - region->vhr_idx)) { + return (E2BIG); + } + + vmp = vq_page_hold(ring, gpa & PAGEMASK, writable); + if (vmp == NULL) { + return (EFAULT); + } + buf = (caddr_t)vmm_drv_page_readable(vmp); + + region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset; + region->vhr_iov[region->vhr_idx].iov_len = front_len; + region->vhr_idx++; + gpa += front_len; + len -= front_len; + if (region->vhr_head == NULL) { + region->vhr_head = vmp; + region->vhr_tail = vmp; + } else { + vmm_drv_page_chain(region->vhr_tail, vmp); + region->vhr_tail = vmp; + } + + for (uint_t i = 1; i < pages; i++) { + ASSERT3U(gpa & PAGEOFFSET, ==, 0); + + vmp = vq_page_hold(ring, gpa, writable); + if (vmp == NULL) { + return (EFAULT); + } + buf = (caddr_t)vmm_drv_page_readable(vmp); + + const uint32_t chunk_len = MIN(len, PAGESIZE); + region->vhr_iov[region->vhr_idx].iov_base = buf; + region->vhr_iov[region->vhr_idx].iov_len = chunk_len; + region->vhr_idx++; + gpa += chunk_len; + len -= chunk_len; + vmm_drv_page_chain(region->vhr_tail, vmp); + region->vhr_tail = vmp; + } + + return (0); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + mutex_enter(&ring->vr_lock); + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + ring->vr_cur_uidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + ring->vr_pa = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + const uint16_t qsz = ring->vr_size; + uintptr_t pa = ring->vr_pa; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(qsz, <=, VRING_MAX_LEN); + ASSERT3U(pa, !=, 0); + ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0); + ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3P(ring->vr_map_pages, ==, NULL); + + const uint_t npages = LEGACY_VQ_PAGES(qsz); + ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP); + + vmm_page_t *prev = NULL; + + for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) { 
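+ /*
+ * Hold each guest page backing the ring, chaining the holds together
+ * so they can be released as a single unit when the ring is unmapped,
+ * and record a writable kernel mapping for every page.
+ */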
+ vmm_page_t *vmp; + + vmp = vq_page_hold(ring, pa, true); + if (vmp == NULL) { + viona_ring_unmap(ring); + return (B_FALSE); + } + + /* + * Keep the first page has the head of the chain, appending all + * subsequent pages to the tail. + */ + if (prev == NULL) { + ring->vr_map_hold = vmp; + } else { + vmm_drv_page_chain(prev, vmp); + } + prev = vmp; + ring->vr_map_pages[i] = vmm_drv_page_writable(vmp); + } + + return (B_TRUE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + void **map = ring->vr_map_pages; + if (map != NULL) { + const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size); + kmem_free(map, npages * sizeof (void *)); + ring->vr_map_pages = NULL; + + vmm_drv_page_release_chain(ring->vr_map_hold); + ring->vr_map_hold = NULL; + } else { + ASSERT3P(ring->vr_map_hold, ==, NULL); + } +} + +static inline void * +viona_ring_addr(viona_vring_t *ring, uint_t off) +{ + ASSERT3P(ring->vr_map_pages, !=, NULL); + ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off); + + const uint_t page_num = off / PAGESIZE; + const uint_t page_off = off % PAGESIZE; + return ((caddr_t)ring->vr_map_pages[page_num] + page_off); +} + +void +viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check) +{ + if (!skip_flags_check) { + volatile uint16_t *avail_flags = viona_ring_addr(ring, + LEGACY_AVAIL_FLAGS_OFF(ring->vr_size)); + + if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) { + return; + } + } + + mutex_enter(&ring->vr_lock); + uint64_t addr = ring->vr_msi_addr; + uint64_t msg = ring->vr_msi_msg; + mutex_exit(&ring->vr_lock); + if (addr != 0) { + /* Deliver the interrupt directly, if so configured... */ + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + } else { + /* ... otherwise, leave it to userspace */ + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + + VERIFY3U(ring->vr_state, ==, VRS_STOP); + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
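+ * Every in-flight desballoc()ed TX frame holds a count in
+ * vr_xfer_outstanding, so that count must have drained to zero by the
+ * time the worker reaches this point.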
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_size = 0; + ring->vr_mask = 0; + ring->vr_pa = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +void +vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp) +{ + const uint_t entry_off = idx * sizeof (struct virtio_desc); + + ASSERT3U(idx, <, ring->vr_size); + + bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp)); +} + +static uint16_t +vq_read_avail(viona_vring_t *ring, uint16_t idx) +{ + ASSERT3U(idx, <, ring->vr_size); + + volatile uint16_t *avail_ent = + viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx)); + return (*avail_ent); +} + +/* + * Given a buffer descriptor `desc`, attempt to map the pages backing that + * region of guest physical memory, taking into account that there are no + * guarantees about guest-contiguous pages being host-contiguous. + */ +static int +vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc, + vq_held_region_t *region) +{ + int err; + + if (desc->vd_len == 0) { + VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring, + uint32_t, desc->vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + return (EINVAL); + } + + err = vq_region_hold(ring, desc->vd_addr, desc->vd_len, + (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region); + switch (err) { + case E2BIG: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); + break; + case EFAULT: + VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + break; + default: + break; + } + + return (err); +} + +/* + * Walk an indirect buffer descriptor `desc`, attempting to map the pages + * backing the regions of guest memory covered by its contituent descriptors. + */ +static int +vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc, + vq_held_region_t *region) +{ + const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc); + + if ((desc->vd_len & 0xf) != 0 || indir_count == 0 || + indir_count > ring->vr_size || + desc->vd_addr > (desc->vd_addr + desc->vd_len)) { + VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring, + uint32_t, desc->vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + return (EINVAL); + } + + uint16_t indir_next = 0; + const uint8_t *buf = NULL; + uint64_t buf_gpa = UINT64_MAX; + vmm_page_t *vmp = NULL; + int err = 0; + + for (;;) { + uint64_t indir_gpa = + desc->vd_addr + (indir_next * sizeof (struct virtio_desc)); + uint64_t indir_page = indir_gpa & PAGEMASK; + struct virtio_desc vp; + + /* + * Get a mapping for the page that the next indirect descriptor + * resides in, if has not already been done. 
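+ * That is, the page currently held (tracked via buf_gpa) is reused
+ * whenever the next descriptor lies within it, and is only replaced
+ * when the table crosses onto a different guest page.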
+ */ + if (indir_page != buf_gpa) { + if (vmp != NULL) { + vmm_drv_page_release(vmp); + } + vmp = vq_page_hold(ring, indir_page, false); + if (vmp == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, indir_page); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + err = EFAULT; + break; + } + buf_gpa = indir_page; + buf = vmm_drv_page_readable(vmp); + } + + /* + * A copy of the indirect descriptor is made here, rather than + * simply using a reference pointer. This prevents malicious or + * erroneous guest writes to the descriptor from fooling the + * flags/bounds verification through a race. + */ + bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp)); + + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, indir_bad_nest); + err = EINVAL; + break; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + err = EINVAL; + break; + } + + err = vq_map_desc_bufs(ring, &vp, region); + if (err != 0) { + break; + } + + /* Successfully reach the end of the indir chain */ + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) { + break; + } + if (region->vhr_idx >= region->vhr_niov) { + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); + err = E2BIG; + break; + } + + indir_next = vp.vd_next; + if (indir_next >= indir_count) { + VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring, + uint16_t, indir_next, uint16_t, indir_count); + VIONA_RING_STAT_INCR(ring, indir_bad_next); + err = EINVAL; + break; + } + } + + if (vmp != NULL) { + vmm_drv_page_release(vmp); + } + return (err); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie, vmm_page_t **chain) +{ + uint16_t ndesc, idx, head, next; + struct virtio_desc vdir; + vq_held_region_t region = { + .vhr_niov = niov, + .vhr_iov = iov, + }; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + ASSERT(*chain == NULL); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = viona_ring_num_avail(ring); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = vq_read_avail(ring, idx & ring->vr_mask); + next = head; + + for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + break; + } + + vq_read_desc(ring, next, &vdir); + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vq_map_desc_bufs(ring, &vdir, ®ion) != 0) { + break; + } + } else { + /* + * Per the specification (Virtio 1.1 S2.6.5.3.1): + * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT + * and VIRTQ_DESC_F_NEXT in `flags`. 
+ */ + if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, uint16_t, 0); + VIONA_RING_STAT_INCR(ring, indir_bad_next); + break; + } + + if (vq_map_indir_desc_bufs(ring, &vdir, ®ion) != 0) { + break; + } + } + + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + + *cookie = head; + *chain = region.vhr_head; + return (region.vhr_idx); + } + } + + mutex_exit(&ring->vr_a_mutex); + if (region.vhr_head != NULL) { + /* + * If any pages were held prior to encountering an error, we + * must release them now. + */ + vmm_drv_page_release_chain(region.vhr_head); + } + return (-1); +} + + +static void +vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie, + uint32_t len) +{ + /* + * In a larger ring, entry could be split across pages, so be sure to + * account for that when configuring the transfer by looking up the ID + * and length addresses separately, rather than an address for a + * combined `struct virtio_used`. + */ + const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx); + const uint_t used_len_off = used_id_off + sizeof (uint32_t); + volatile uint32_t *idp = viona_ring_addr(ring, used_id_off); + volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off); + + ASSERT(MUTEX_HELD(&ring->vr_u_mutex)); + + *idp = cookie; + *lenp = len; +} + +static void +vq_write_used_idx(viona_vring_t *ring, uint16_t idx) +{ + ASSERT(MUTEX_HELD(&ring->vr_u_mutex)); + + volatile uint16_t *used_idx = + viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size)); + *used_idx = idx; +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + uint16_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = ring->vr_cur_uidx; + vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len); + uidx++; + membar_producer(); + + vq_write_used_idx(ring, uidx); + ring->vr_cur_uidx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + uint16_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = ring->vr_cur_uidx; + + for (uint_t i = 0; i < num_bufs; i++, uidx++) { + vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id, + elem[i].len); + } + + membar_producer(); + vq_write_used_idx(ring, uidx); + ring->vr_cur_uidx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +/* + * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries. + */ +void +viona_ring_disable_notify(viona_vring_t *ring) +{ + volatile uint16_t *used_flags = + viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size)); + + *used_flags |= VRING_USED_F_NO_NOTIFY; +} + +/* + * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries. + */ +void +viona_ring_enable_notify(viona_vring_t *ring) +{ + volatile uint16_t *used_flags = + viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size)); + + *used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +/* + * Return the number of available descriptors in the vring taking care of the + * 16-bit index wraparound. + * + * Note: If the number of apparently available descriptors is larger than the + * ring size (due to guest misbehavior), this check will still report the + * positive count of descriptors. 
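+ *
+ * For illustration (values hypothetical): with a guest-published
+ * avail_idx of 2 and a local vr_cur_aidx of 65534, the unsigned 16-bit
+ * subtraction wraps to 4, which is the correct number of entries
+ * published since the last check.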
+ */ +uint16_t +viona_ring_num_avail(viona_vring_t *ring) +{ + volatile uint16_t *avail_idx = + viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size)); + + return (*avail_idx - ring->vr_cur_aidx); +} diff --git a/usr/src/uts/intel/io/viona/viona_rx.c b/usr/src/uts/intel/io/viona/viona_rx.c new file mode 100644 index 0000000000..2fbf6be972 --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_rx.c @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/strsubr.h> + +#include <sys/dlpi.h> +#include <sys/pattr.h> +#include <sys/vlan.h> + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. 
octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + viona_ring_disable_notify(ring); + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. + */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + ring->vr_state = VRS_STOP; + + /* + * The RX ring is stopping, before we start tearing it down it + * is imperative that we perform an RX barrier so that + * incoming packets are dropped at viona_rx_classified(). + */ + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + viona_ring_enable_notify(ring); +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. 
+ */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + vmm_page_t *pages = NULL; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie, &pages); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? */ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vmm_drv_page_release_chain(pages); + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vmm_drv_page_release_chain(pages); + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + vmm_page_t *pages = NULL, *hdr_pages = NULL; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie, &hdr_pages); + if (n <= 0) { + /* Without available buffers, the frame must be 
dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. The size of the header itself + * is accounted for later. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + size_t copy_len; + copy_len = viona_copy_mblk(mp, copied, buf, len, &end); + chunk += copy_len; + copied += copy_len; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + size_t copy_len; + copy_len = viona_copy_mblk(mp, copied, buf, len, &end); + chunk += copy_len; + copied += copy_len; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + if (pages != NULL) { + vmm_drv_page_release_chain(pages); + pages = NULL; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie, + &pages); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. + */ + err = EMSGSIZE; + break; + } + chunk = 0; + i = 0; + buf_idx++; + /* + * Keep the header up-to-date with the number of + * buffers, but never reference its value since the + * guest could meddle with it. + */ + hdr->vrh_bufs++; + } + } while (!end && copied < msz); + + /* Account for the header size in the first buffer */ + uelem[0].len += hdr_sz; + + /* + * If no other errors were encounted during the copy, was the expected + * amount of data transfered? 
+ */ + if (err == 0 && copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + err = EINVAL; + } + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + +done: + switch (err) { + case 0: + /* Success can fall right through to ring delivery */ + break; + + case EMSGSIZE: + VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_underrun); + break; + + case EOVERFLOW: + VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_overrun); + break; + + default: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + } + + if (hdr_pages != NULL) { + vmm_drv_page_release_chain(hdr_pages); + } + if (pages != NULL) { + vmm_drv_page_release_chain(pages); + } + vq_pushchain_many(ring, buf_idx + 1, uelem); + return (err); +} + +static void +viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback) +{ + viona_link_t *link = ring->vr_link; + mblk_t *mprx = NULL, **mprx_prevp = &mprx; + mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop; + const boolean_t do_merge = + ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0); + + size_t nrx = 0, ndrop = 0; + + while (mp != NULL) { + mblk_t *next = mp->b_next; + mblk_t *pad = NULL; + size_t size = msgsize(mp); + int err = 0; + + mp->b_next = NULL; + + /* + * We treat both a 'drop' response and errors the same here + * and put the packet on the drop chain. As packets may be + * subject to different actions in ipf (which do not all + * return the same set of error values), an error processing + * one packet doesn't mean the next packet will also generate + * an error. + */ + if (VNETHOOK_INTERESTED_IN(link->l_neti) && + viona_hook(link, ring, &mp, B_FALSE) != 0) { + if (mp != NULL) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + } else { + /* + * If the hook consumer (e.g. ipf) already + * freed the mblk_t, update the drop count now. + */ + ndrop++; + } + mp = next; + continue; + } + + /* + * Ethernet frames are expected to be padded out in order to + * meet the minimum size. + * + * A special case is made for frames which are short by + * VLAN_TAGSZ, having been stripped of their VLAN tag while + * traversing MAC. A preallocated (and recycled) mblk is used + * for that specific condition. + * + * All other frames that fall short on length will have custom + * zero-padding allocated appended to them. 
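+ *
+ * For example (sizes illustrative): a 42-byte ARP request gains an
+ * 18-byte zero-filled mblk to reach the 60-byte minimum, while a
+ * 56-byte frame (exactly VLAN_TAGSZ short) simply has the shared
+ * 4-byte pad mblk linked onto its tail.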
+ */ + if (size == NEED_VLAN_PAD_SIZE) { + ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); + ASSERT(viona_vlan_pad_mp->b_cont == NULL); + + for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) + ; + + pad->b_cont = viona_vlan_pad_mp; + size += VLAN_TAGSZ; + } else if (size < MIN_BUF_SIZE) { + const size_t pad_size = MIN_BUF_SIZE - size; + mblk_t *zero_mp; + + zero_mp = allocb(pad_size, BPRI_MED); + if (zero_mp == NULL) { + err = ENOMEM; + goto pad_drop; + } + + VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, + mblk_t *, mp, size_t, pad_size); + VIONA_RING_STAT_INCR(ring, rx_pad_short); + zero_mp->b_wptr += pad_size; + bzero(zero_mp->b_rptr, pad_size); + linkb(mp, zero_mp); + size += pad_size; + } + + if (do_merge) { + err = viona_recv_merged(ring, mp, size); + } else { + err = viona_recv_plain(ring, mp, size); + } + + /* + * The VLAN padding mblk is meant for continual reuse, so + * remove it from the chain to prevent it from being freed. + * + * Custom allocated padding does not require this treatment and + * is freed normally. + */ + if (pad != NULL) { + pad->b_cont = NULL; + } + +pad_drop: + /* + * While an error during rx processing + * (viona_recv_{merged,plain}) does not free mp on error, + * hook processing might or might not free mp. Handle either + * scenario -- if mp is not yet free, it is queued up and + * freed after the guest has been notified. If mp is + * already NULL, just proceed on. + */ + if (err != 0) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + + /* + * If the available ring is empty, do not bother + * attempting to deliver any more frames. Count the + * rest as dropped too. + */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + viona_intr_ring(ring, B_FALSE); + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. 
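+ *
+ * To that end, the loop below classifies each frame with
+ * mac_vlan_header_info() and passes along only those destined to a
+ * MAC_ADDRTYPE_MULTICAST address, discarding the rest.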
+ */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. + */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/intel/io/viona/viona_tx.c b/usr/src/uts/intel/io/viona/viona_tx.c new file mode 100644 index 0000000000..424deee498 --- /dev/null +++ b/usr/src/uts/intel/io/viona/viona_tx.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + + +#include <sys/types.h> +#include <sys/smt.h> +#include <sys/strsubr.h> + +#include <sys/pattr.h> +#include <sys/dlpi.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> + +#include "viona_impl.h" + +#define BNXE_NIC_DRIVER "bnxe" + +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. + */ +kmutex_t viona_force_copy_lock; +static enum viona_force_copy { + VFC_UNINITALIZED = 0, + VFC_COPY_UNEEDED = 1, + VFC_COPY_REQUIRED = 2, +} viona_force_copy_state = VFC_UNINITALIZED; + +struct viona_desb { + frtn_t d_frtn; + viona_vring_t *d_ring; + uint_t d_ref; + uint32_t d_len; + uint16_t d_cookie; + uchar_t *d_headers; + vmm_page_t *d_pages; +}; + +static void viona_tx(viona_link_t *, viona_vring_t *); +static void viona_desb_release(viona_desb_t *); + + +static void +viona_tx_wait_outstanding(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + while (ring->vr_xfer_outstanding != 0) { + /* + * Paying heed to signals is counterproductive here. This is a + * very tight loop if pending transfers take an extended amount + * of time to be reclaimed while the host process is exiting. + */ + cv_wait(&ring->vr_cv, &ring->vr_lock); + } +} + +/* + * Check if full TX packet copying is needed. This should not be called from + * viona attach()/detach() context. + */ +static boolean_t +viona_tx_copy_needed(void) +{ + boolean_t result; + + mutex_enter(&viona_force_copy_lock); + if (viona_force_copy_state == VFC_UNINITALIZED) { + major_t bnxe_major; + + /* + * The original code for viona featured an explicit check for + * the bnxe driver which, when found present, necessitated that + * all transmissions be copied into their own mblks instead of + * passing guest memory to the underlying device. + * + * The motivations for this are unclear, but until it can be + * proven unnecessary, the check lives on. 
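+ *
+ * The answer is latched in viona_force_copy_state under
+ * viona_force_copy_lock, so the driver lookup is performed at most
+ * once, when the first TX ring is allocated.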
+ */ + viona_force_copy_state = VFC_COPY_UNEEDED; + if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bnxe_major) != NULL) { + viona_force_copy_state = VFC_COPY_REQUIRED; + ddi_rele_driver(bnxe_major); + } + } + } + result = (viona_force_copy_state == VFC_COPY_REQUIRED); + mutex_exit(&viona_force_copy_lock); + + return (result); +} + +void +viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) +{ + /* Allocate desb handles for TX ring if packet copying not disabled */ + if (!viona_tx_copy_needed()) { + viona_desb_t *dp; + + dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); + ring->vr_txdesb = dp; + for (uint_t i = 0; i < qsz; i++, dp++) { + dp->d_frtn.free_func = viona_desb_release; + dp->d_frtn.free_arg = (void *)dp; + dp->d_ring = ring; + dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, + KM_SLEEP); + } + } + + /* Allocate ring-sized iovec buffers for TX */ + ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); +} + +void +viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) +{ + if (ring->vr_txdesb != NULL) { + viona_desb_t *dp = ring->vr_txdesb; + + for (uint_t i = 0; i < qsz; i++, dp++) { + kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); + } + kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); + ring->vr_txdesb = NULL; + } + + if (ring->vr_txiov != NULL) { + kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); + ring->vr_txiov = NULL; + } +} + +static void +viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + vq_pushchain(ring, len, cookie); + + membar_enter(); + viona_intr_ring(ring, B_FALSE); +} + +void +viona_worker_tx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_tx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + mutex_exit(&ring->vr_lock); + + for (;;) { + boolean_t bail = B_FALSE; + boolean_t renew = B_FALSE; + uint_t ntx = 0; + + viona_ring_disable_notify(ring); + while (viona_ring_num_avail(ring)) { + viona_tx(link, ring); + + /* + * It is advantageous for throughput to keep this + * transmission loop tight, but periodic breaks to + * check for other events are of value too. + */ + if (ntx++ >= ring->vr_size) + break; + } + viona_ring_enable_notify(ring); + + VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); + + /* + * Check for available descriptors on the ring once more in + * case a late addition raced with the NO_NOTIFY flag toggle. + * + * The barrier ensures that visibility of the no-notify + * store does not cross the viona_ring_num_avail() check below. + */ + membar_enter(); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + if (!bail && !renew && viona_ring_num_avail(ring)) { + continue; + } + + if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { + /* + * The NOTIFY_ON_EMPTY interrupt should not pay heed to + * the presence of AVAIL_NO_INTERRUPT. + */ + viona_intr_ring(ring, B_TRUE); + } + + mutex_enter(&ring->vr_lock); + + while (!bail && !renew && !viona_ring_num_avail(ring)) { + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + } + + if (bail) { + break; + } else if (renew) { + ring->vr_state_flags |= VRSF_RENEW; + /* + * When renewing the lease for the ring, no TX + * frames may be outstanding, as they contain + * references to guest memory. 
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_state = VRS_STOP; + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + vmm_drv_page_release_chain(dp->d_pages); + dp->d_pages = NULL; + + /* + * Ensure all other changes to the desb are visible prior to zeroing its + * refcount, signifying its readiness for reuse. + */ + membar_exit(); + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. 
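+ * (The replacement pseudo-header sum computed below covers only the
+ * source address, the destination address, and the protocol constant,
+ * deliberately leaving the TCP length out.)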
+ * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. + */ + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); + cksum = IP_TCP_CSUM_COMP; + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Since viona is a "legacy device", the data stored + * by the driver will be in the guest's native endian + * format (see sections 2.4.3 and 5.1.6.1 of the + * VIRTIO 1.0 spec for more info). At this time the + * only guests using viona are x86 and we can assume + * little-endian. + */ + lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); + + /* + * Hardware, like ixgbe, expects the client to request + * IP header checksum offload if it's sending LSO (see + * ixgbe_get_context()). Unfortunately, virtio makes + * no allowances for negotiating IP header checksum + * and HW offload, only TCP checksum. We add the flag + * and zero-out the checksum field. This mirrors the + * behavior of our native IP stack (which does this in + * the interest of HW that expects the field to be + * zero). + */ + flags |= HCK_IPV4_HDRCKSUM; + ipha->ipha_hdr_checksum = 0; + } + + /* + * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure + * HW_LSO, if present, is not lost. + */ + flags |= DB_CKSUMFLAGS(mp); + + /* + * Partial checksum support from the NIC is ideal, since it most + * closely maps to the interface defined by virtio. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + /* + * MAC expects these offsets to be relative to the + * start of the L3 header rather than the L2 frame. + */ + flags |= HCK_PARTIALCKSUM; + mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, + len - eth_len, 0, flags); + return (B_TRUE); + } + + /* + * Without partial checksum support, look to the L3/L4 protocol + * information to see if the NIC can handle it. If not, the + * checksum will need to calculated inline. + */ + if (ftype == ETHERTYPE_IP) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? */ + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } else if (ftype == ETHERTYPE_IPV6) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? 
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + vmm_page_t *pages = NULL; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie, &pages); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
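+ *
+ * For example (lengths hypothetical): if only 10 header
+ * bytes remained to be gathered and this descriptor holds
+ * 1500, base_off becomes 10 and the remaining 1490 bytes
+ * are handled from that offset by the loop below.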
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + dp->d_pages = pages; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + vmm_drv_page_release_chain(pages); + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. 
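+	 * Both drop paths below fall through to viona_tx_done(), so the
+	 * popped descriptor chain is still returned to the used ring.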
+ */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. + */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + vmm_drv_page_release_chain(pages); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/intel/io/vmm/Makefile.rules b/usr/src/uts/intel/io/vmm/Makefile.rules new file mode 100644 index 0000000000..1551659cc4 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/Makefile.rules @@ -0,0 +1,48 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# Copyright 2022 Oxide Computer Company +# + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/amd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/intel/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/intel/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmm/amd/%.s + $(COMPILE.s) -o $@ $< + +$(ASSYM_VMX): $(OFFSETS_VMX) $(GENASSYM) + $(OFFSETS_CREATE) -I$(UTSBASE)/intel/io/vmm < $(OFFSETS_VMX) >$@ +$(ASSYM_SVM): $(OFFSETS_SVM) $(GENASSYM) + $(OFFSETS_CREATE) -I$(UTSBASE)/intel/io/vmm < $(OFFSETS_SVM) >$@ + +$(OBJS_DIR)/vmx_support.o: $(ASSYM_VMX) +$(OBJS_DIR)/svm_support.o: $(ASSYM_SVM) diff --git a/usr/src/uts/intel/io/vmm/Makefile.vmm b/usr/src/uts/intel/io/vmm/Makefile.vmm new file mode 100644 index 0000000000..920da5dee6 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/Makefile.vmm @@ -0,0 +1,85 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. 
+# Copyright 2022 Oxide Computer Company +# + +CONF_SRCDIR = $(UTSBASE)/intel/io/vmm +MAPFILE = $(UTSBASE)/intel/io/vmm/vmm.mapfile + +PRE_INC_PATH = \ + -I$(COMPAT)/bhyve \ + -I$(COMPAT)/bhyve/amd64 \ + -I$(CONTRIB)/bhyve \ + -I$(CONTRIB)/bhyve/amd64 + +INC_PATH += -I$(UTSBASE)/intel/io/vmm -I$(UTSBASE)/intel/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/intel/io/vmm -I$(OBJS_DIR) + +# enable collection of VMM statistics +CFLAGS += -DVMM_KEEP_STATS + +LDFLAGS += -N misc/acpica -N misc/pcie -N fs/dev +LDFLAGS += -M $(MAPFILE) + +# 3rd party code +SMOFF += all_func_returns + +# needs work +$(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check + +OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in +OFFSETS_SVM = $(CONF_SRCDIR)/amd/offsets.in +ASSYM_VMX = $(OBJS_DIR)/vmx_assym.h +ASSYM_SVM = $(OBJS_DIR)/svm_assym.h + +CLEANFILES += $(ASSYM_VMX) $(ASSYM_SVM) + +VMM_OBJS = \ + vmm.o \ + vmm_sol_dev.o \ + vmm_host.o \ + vmm_instruction_emul.o \ + vmm_ioport.o \ + vmm_lapic.o \ + vmm_stat.o \ + vmm_util.o \ + x86.o \ + iommu.o \ + vatpic.o \ + vatpit.o \ + vhpet.o \ + vioapic.o \ + vlapic.o \ + vrtc.o \ + vpmtmr.o \ + vmcs.o \ + vmx_msr.o \ + vmx.o \ + vmx_support.o \ + vtd.o \ + vtd_sol.o \ + svm.o \ + svm_msr.o \ + vmcb.o \ + svm_support.o \ + amdv.o \ + vmm_gpt.o \ + seg_vmm.o \ + vmm_reservoir.o \ + vmm_sol_glue.o \ + vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ + vmm_vm.o \ + vmm_zsd.o diff --git a/usr/src/uts/intel/io/vmm/README.license b/usr/src/uts/intel/io/vmm/README.license new file mode 100644 index 0000000000..55ad5d596d --- /dev/null +++ b/usr/src/uts/intel/io/vmm/README.license @@ -0,0 +1,5 @@ +Having been ported from FreeBSD, bhyve bears the BSD license. Subsequent +changes made to bhyve in illumos are dual-licensed under both the BSD license +and the CDDL. Use or redistribution of those subsequent changes may be done +under either license. The CDDL license header added to bhyve source files is +meant to cover only those dual-licensed modifications, not the entire file. diff --git a/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE b/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE new file mode 100644 index 0000000000..66b39dc950 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE @@ -0,0 +1,26 @@ + +SPDX-License-Identifier: BSD-2-Clause-FreeBSD + +Copyright (c) 1992-2020 The FreeBSD Project. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + diff --git a/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE.descrip b/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..77026fc8a3 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +Bhyve hypervisor diff --git a/usr/src/uts/intel/io/vmm/amd/amdiommu.c b/usr/src/uts/intel/io/vmm/amd/amdiommu.c new file mode 100644 index 0000000000..4dd13b0195 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/amdiommu.c @@ -0,0 +1,185 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 The FreeBSD Foundation + * + * Portions of this software were developed by Ka Ho Ng + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/rman.h> + +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +#include "amdvi_priv.h" +#include "ivhd_if.h" + +struct amdiommu_softc { + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. 
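+				 * (set by bus_setup_intr() in
+				 * ivhd_setup_intr() below)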
*/ + int event_rid; +}; + +static int amdiommu_probe(device_t); +static int amdiommu_attach(device_t); +static int amdiommu_detach(device_t); +static int ivhd_setup_intr(device_t, driver_intr_t, void *, + const char *); +static int ivhd_teardown_intr(device_t); + +static device_method_t amdiommu_methods[] = { + /* device interface */ + DEVMETHOD(device_probe, amdiommu_probe), + DEVMETHOD(device_attach, amdiommu_attach), + DEVMETHOD(device_detach, amdiommu_detach), + DEVMETHOD(ivhd_setup_intr, ivhd_setup_intr), + DEVMETHOD(ivhd_teardown_intr, ivhd_teardown_intr), + DEVMETHOD_END +}; +static driver_t amdiommu_driver = { + "amdiommu", + amdiommu_methods, + sizeof (struct amdiommu_softc), +}; + +static int +amdiommu_probe(device_t dev) +{ + int error; + int capoff; + + /* + * Check base class and sub-class + */ + if (pci_get_class(dev) != PCIC_BASEPERIPH || + pci_get_subclass(dev) != PCIS_BASEPERIPH_IOMMU) + return (ENXIO); + + /* + * A IOMMU capability block carries a 0Fh capid. + */ + error = pci_find_cap(dev, PCIY_SECDEV, &capoff); + if (error) + return (ENXIO); + + /* + * bit [18:16] == 011b indicates the capability block is IOMMU + * capability block. If the field is not set to 011b, bail out. + */ + if ((pci_read_config(dev, capoff + 2, 2) & 0x7) != 0x3) + return (ENXIO); + + return (BUS_PROBE_SPECIFIC); +} + +static int +amdiommu_attach(device_t dev) +{ + + device_set_desc(dev, "AMD-Vi/IOMMU PCI function"); + return (0); +} + +static int +amdiommu_detach(device_t dev) +{ + + return (0); +} + +static int +ivhd_setup_intr(device_t dev, driver_intr_t handler, void *arg, + const char *desc) +{ + struct amdiommu_softc *sc; + int error, msicnt; + + sc = device_get_softc(dev); + msicnt = 1; + if (sc->event_res != NULL) + panic("%s is called without intr teardown", __func__); + sc->event_rid = 1; + + error = pci_alloc_msi(dev, &msicnt); + if (error) { + device_printf(dev, "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + sc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &sc->event_rid, RF_ACTIVE); + if (sc->event_res == NULL) { + device_printf(dev, "Unable to allocate event INTR resource.\n"); + error = ENOMEM; + goto fail; + } + + error = bus_setup_intr(dev, sc->event_res, INTR_TYPE_MISC | INTR_MPSAFE, + NULL, handler, arg, &sc->event_tag); + if (error) { + device_printf(dev, "Fail to setup event intr\n"); + goto fail; + } + + bus_describe_intr(dev, sc->event_res, sc->event_tag, "%s", desc); + return (0); + +fail: + ivhd_teardown_intr(dev); + return (error); +} + +static int +ivhd_teardown_intr(device_t dev) +{ + struct amdiommu_softc *sc; + + sc = device_get_softc(dev); + + if (sc->event_tag != NULL) { + bus_teardown_intr(dev, sc->event_res, sc->event_tag); + sc->event_tag = NULL; + } + if (sc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, sc->event_rid, + sc->event_res); + sc->event_res = NULL; + } + pci_release_msi(dev); + return (0); +} + +static devclass_t amdiommu_devclass; + +/* This driver has to be loaded before ivhd */ +DRIVER_MODULE(amdiommu, pci, amdiommu_driver, amdiommu_devclass, 0, 0); +MODULE_DEPEND(amdiommu, pci, 1, 1, 1); diff --git a/usr/src/uts/intel/io/vmm/amd/amdv.c b/usr/src/uts/intel/io/vmm/amd/amdv.c new file mode 100644 index 0000000000..b056ab86d2 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/amdv.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +const struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; diff --git a/usr/src/uts/intel/io/vmm/amd/amdvi_hw.c b/usr/src/uts/intel/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..33a2557492 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1379 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/sysctl.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "ivhd_if.h" +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc, int count); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. */ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. 
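+ * (The depth comes from the low bits of the ATS capability; a value of
+ * zero is treated as the default queue depth of 32.)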
+ */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. 
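+ * The tail advances by one command entry, wrapping modulo the size of
+ * the command buffer (see MOD_INC above).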
+ */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | AMDVI_CMP_WAIT_STORE; + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. */ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. 
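+	 * A single INVALIDATE_IOMMU_PAGES command with the all-addresses
+	 * encoding and the PDE and S bits set covers the entire domain.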
+ */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. */ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + /* Dump the last command. */ + amdvi_dump_cmds(softc, 1); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc, int count) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump last %d command(s):\n", count); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && i < count; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = MOD_INC(off, sizeof(struct amdvi_cmd), softc->cmd_max); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. 
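+	 * Both the event log head and tail start at zero for the freshly
+	 * allocated buffer.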
*/ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_DTE_HW_ERROR: + amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PAGE_HW_ERROR: + amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_ILLEGAL_CMD: + /* FALL THROUGH */ + case AMDVI_EVENT_CMD_HW_ERROR: + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? 
+ "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. */ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + device_t mmio_dev; + + softc = device_get_softc(dev); + mmio_dev = softc->pci_dev; + + IVHD_TEARDOWN_INTR(mmio_dev); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, mmio_dev; + int err; + + dev = softc->dev; + mmio_dev = softc->pci_dev; + + /* Clear interrupt status bits. 
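+	 * Only the event overflow and event interrupt bits are written
+	 * back; amdvi_event_intr() performs the same acknowledgement after
+	 * servicing each event interrupt.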
*/ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + err = IVHD_SETUP_INTR(mmio_dev, amdvi_event_intr, softc, "fault"); + if (err) + device_printf(dev, "Interrupt setup failed on %s\n", + device_get_nameunit(mmio_dev)); + return (err); +} + +static void +amdvi_print_dev_cap(struct amdvi_softc *softc) +{ + struct ivhd_dev_cfg *cfg; + int i; + + cfg = softc->dev_cfg; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + device_printf(softc->dev, "device [0x%x - 0x%x] " + "config:%b%s\n", cfg->start_id, cfg->end_id, + cfg->data, + "\020\001INIT\002ExtInt\003NMI" + "\007LINT0\010LINT1", + cfg->enable_ats ? "ATS enabled" : ""); + cfg++; + } +} + +static int +amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct amdvi_softc *softc; + int result, type, error = 0; + + softc = (struct amdvi_softc *)arg1; + type = arg2; + + switch (type) { + case 0: + result = softc->ctrl->cmd_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 1: + result = softc->ctrl->cmd_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 2: + result = softc->ctrl->evt_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 3: + result = softc->ctrl->evt_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + + default: + device_printf(softc->dev, "Unknown sysctl:%d\n", type); + } + + return (error); +} + +static void +amdvi_add_sysctl(struct amdvi_softc *softc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev; + + dev = softc->dev; + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, + &softc->event_intr_cnt, "Event interrupt count"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, + &softc->total_cmd, "Command submitted count"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 0, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 1, + amdvi_handle_sysctl, "IU", "Command tail"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 2, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 3, + amdvi_handle_sysctl, "IU", "Command tail"); +} + +int +amdvi_setup_hw(struct amdvi_softc *softc) +{ + device_t dev; + int status; + + dev = softc->dev; + + amdvi_hw_enable_iotlb(softc); + + amdvi_print_dev_cap(softc); + + if ((status = amdvi_print_pci_cap(dev)) != 0) { + device_printf(dev, "PCI capability.\n"); + return (status); + } + if ((status = amdvi_init_cmd(softc)) != 0) { + device_printf(dev, "Couldn't configure command buffer.\n"); + return (status); + } + if ((status = amdvi_init_event(softc)) != 0) { + device_printf(dev, "Couldn't configure event buffer.\n"); + return (status); + } + if ((status = amdvi_init_dte(softc)) != 0) { + device_printf(dev, "Couldn't configure device table.\n"); + return (status); + } + if ((status = amdvi_alloc_intr_resources(softc)) != 0) { + return (status); + } + amdvi_add_sysctl(softc); + return (0); +} + +int +amdvi_teardown_hw(struct amdvi_softc *softc) +{ + device_t dev; + + dev = softc->dev; + + /* + * Called 
after disable, h/w is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
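+		 * (An entry whose next-level field is 7 maps a large page
+		 * directly, so there is no lower-level table to recurse
+		 * into.)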
*/ + if (AMDVI_PD_SUPER(ptp[i])) + continue; +#endif + + amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] + & AMDVI_PT_MASK), level - 1); + } + + free(ptp, M_AMDVI); +} + +static void +amdvi_destroy_domain(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Destroying domain %d\n", domain->id); +#endif + if (domain->ptp) + amdvi_free_ptp(domain->ptp, domain->ptp_level); + + amdvi_do_inv_domain(domain->id, false); + SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); + free(domain, M_AMDVI); +} + +static uint64_t +amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t pg_size, bool create) +{ + uint64_t *page, pa; + int shift, index; + const int PT_SHIFT = 9; + const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ + + if (!pg_size) + return (0); + + if (hpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + if (gpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + shift = PML4SHIFT; + while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { + index = (gpa >> shift) & PT_INDEX_MASK; + + if ((pt[index] == 0) && create) { + page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + pa = vtophys(page); + pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | + ((level - 1) << AMDVI_PD_LEVEL_SHIFT); + } +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif +#define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) + pa = PTE2PA(pt[index]); + pt = (uint64_t *)PHYS_TO_DMAP(pa); + shift -= PT_SHIFT; + level--; + } + + /* Leaf entry. */ + index = (gpa >> shift) & PT_INDEX_MASK; + + if (create) { + pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; + } else + pt[index] = 0; + +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[Last level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif + return (1ULL << shift); +} + +static uint64_t +amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t size, bool create) +{ + uint64_t mapped, *ptp, len; + int level; + + KASSERT(domain, ("domain is NULL")); + level = domain->ptp_level; + KASSERT(level, ("Page table level is 0")); + + ptp = domain->ptp; + KASSERT(ptp, ("PTP is NULL")); + mapped = 0; + while (mapped < size) { + len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, + PAGE_SIZE, create); + if (!len) { + printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", + hpa, gpa); + return (0); + } + mapped += len; + } + + return (mapped); +} + +static uint64_t +amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + + if (domain->id && !domain->ptp) { + printf("ptp is NULL"); + return (-1); + } + + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. + */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, hpa, len, true)); + else + return (len); +} + +static uint64_t +amdvi_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. 
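+	 * Unmapping is amdvi_update_mapping() with create == false, which
+	 * zeroes the leaf PTEs for the range.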
+ */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, 0, len, false)); + return + (len); +} + +static struct amdvi_softc * +amdvi_find_iommu(uint16_t devid) +{ + struct amdvi_softc *softc; + int i, j; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + for (j = 0; j < softc->dev_cfg_cnt; j++) + if ((devid >= softc->dev_cfg[j].start_id) && + (devid <= softc->dev_cfg[j].end_id)) + return (softc); + } + + return (NULL); +} + +/* + * Set-up device table entry. + * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must + * be set concurrently, e.g. read and write bits. + */ +static void +amdvi_set_dte(struct amdvi_domain *domain, struct amdvi_softc *softc, + uint16_t devid, bool enable) +{ + struct amdvi_dte* temp; + + KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); + KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); + + temp = &amdvi_dte[devid]; + +#ifdef AMDVI_ATS_ENABLE + /* If IOMMU and device support IOTLB, enable it. */ + if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) + temp->iotlb_enable = 1; +#endif + + /* Avoid duplicate I/O faults. */ + temp->sup_second_io_fault = 1; + temp->sup_all_io_fault = amdvi_disable_io_fault; + + temp->dt_valid = 1; + temp->domain_id = domain->id; + + if (enable) { + if (domain->ptp) { + temp->pt_base = vtophys(domain->ptp) >> 12; + temp->pt_level = amdvi_ptp_level; + } + /* + * XXX: Page table valid[TV] bit must be set even if host domain + * page tables are not enabled. + */ + temp->pt_valid = 1; + temp->read_allow = 1; + temp->write_allow = 1; + } +} + +static void +amdvi_inv_device(struct amdvi_softc *softc, uint16_t devid) +{ + KASSERT(softc, ("softc is NULL")); + + amdvi_cmd_inv_dte(softc, devid); +#ifdef AMDVI_ATS_ENABLE + if (amdvi_dev_support_iotlb(softc, devid)) + amdvi_cmd_inv_iotlb(softc, devid); +#endif + amdvi_wait(softc); +} + +static void +amdvi_add_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + struct amdvi_softc *softc; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain != NULL, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Assigning device(%d.%d.%d) to domain:%d\n", + RID2PCI_STR(devid), domain->id); +#endif + softc = amdvi_find_iommu(devid); + if (softc == NULL) + return; + amdvi_set_dte(domain, softc, devid, true); + amdvi_inv_device(softc, devid); +} + +static void +amdvi_remove_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + struct amdvi_softc *softc; + + domain = (struct amdvi_domain *)arg; +#ifdef AMDVI_DEBUG_CMD + printf("Remove device(0x%x) from domain:%d\n", + devid, domain->id); +#endif + softc = amdvi_find_iommu(devid); + if (softc == NULL) + return; + amdvi_set_dte(domain, softc, devid, false); + amdvi_inv_device(softc, devid); +} + +static void +amdvi_enable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + uint64_t val; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + val = ( AMDVI_CTRL_EN | + AMDVI_CTRL_CMD | + AMDVI_CTRL_ELOG | + AMDVI_CTRL_ELOGINT | + AMDVI_CTRL_INV_TO_1S); + + if (softc->ivhd_flag & IVHD_FLAG_COH) + val |= AMDVI_CTRL_COH; + if (softc->ivhd_flag & IVHD_FLAG_HTT) + val |= AMDVI_CTRL_HTT; + if (softc->ivhd_flag & IVHD_FLAG_RPPW) + val |= AMDVI_CTRL_RPPW; + if (softc->ivhd_flag & IVHD_FLAG_PPW) + val |= AMDVI_CTRL_PPW; + if (softc->ivhd_flag & IVHD_FLAG_ISOC) + val |= AMDVI_CTRL_ISOC; + + ctrl->control = 
val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static void +amdvi_invalidate_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); +} + +const struct iommu_ops iommu_ops_amd = { + .init = amdvi_init, + .cleanup = amdvi_cleanup, + .enable = amdvi_enable, + .disable = amdvi_disable, + .create_domain = amdvi_create_domain, + .destroy_domain = amdvi_destroy_domain, + .create_mapping = amdvi_create_mapping, + .remove_mapping = amdvi_remove_mapping, + .add_device = amdvi_add_device, + .remove_device = amdvi_remove_device, + .invalidate_tlb = amdvi_invalidate_tlb +}; diff --git a/usr/src/uts/intel/io/vmm/amd/amdvi_priv.h b/usr/src/uts/intel/io/vmm/amd/amdvi_priv.h new file mode 100644 index 0000000000..5b66c6fa4b --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/amdvi_priv.h @@ -0,0 +1,410 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * Copyright (c) 2021 The FreeBSD Foundation + * + * Portions of this software were developed by Ka Ho Ng + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include <contrib/dev/acpica/include/acpi.h> + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. + */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. 
*/ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. */ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. 
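+ * (Flags used with the AMDVI_INVD_PAGE_OPCODE command above.)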
*/ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. 
*/ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. */ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ + +/* + * Max number of PCI devices. + * 256 bus x 32 slot/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * Different type of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. +*/ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = ACPI_IVRS_TYPE_HARDWARE1, + /* Legacy without EFRi support. */ + IVRS_TYPE_HARDWARE_EFR = ACPI_IVRS_TYPE_HARDWARE2, + /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + device_t pci_dev; /* IOMMU PCI function device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. 
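The AMDVI_PT_* definitions above are all that is needed to compose a leaf I/O page-table entry: mask the physical address down to bits [51:12] and set the present and read/write attributes. A minimal sketch follows; the helper name is ours, not part of this change.

static inline uint64_t
amdvi_leaf_pte(uint64_t pa)
{
	/* keep only PA bits [51:12], then mark the page present and R/W */
	return ((pa & AMDVI_PT_MASK) | AMDVI_PT_PRESENT | AMDVI_PT_RW);
}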
*/ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + int event_max; /* Max number of events. */ + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* ACPI device configuration for end points. */ + struct ivhd_dev_cfg *dev_cfg; + int dev_cfg_cnt; + int dev_cfg_cap; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/intel/io/vmm/amd/ivrs_drv.c b/usr/src/uts/intel/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..6721867dbe --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,760 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * Copyright (c) 2021 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/vmparam.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE1 **ivhd_hdrs; + +extern int amdvi_ptp_level; /* Page table levels. 
*/ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + int *count; + + count = (int *)arg; + if (ivrs_is_ivhd(ivrs_he->Type)) + (*count)++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE1 * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE1 *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + KASSERT(softc->dev_cfg_cap >= softc->dev_cfg_cnt, + ("Impossible case: number of dev_cfg exceeding capacity")); + if (softc->dev_cfg_cap == softc->dev_cfg_cnt) { + if (softc->dev_cfg_cap == 0) + softc->dev_cfg_cap = 1; + else + softc->dev_cfg_cap <<= 2; + softc->dev_cfg = realloc(softc->dev_cfg, + sizeof(*softc->dev_cfg) * softc->dev_cfg_cap, M_DEVBUF, + M_WAITOK); + } + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
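The iterator above hands every IVHD and IVMD header to its ivhd_iter_t callback and stops the walk as soon as the callback returns 0. As an illustration of the same pattern used by ivhd_count_iter(), a hypothetical callback that counts the IVMD (memory) entries instead could look like this:

static int
ivmd_count_iter(ACPI_IVRS_HEADER *hdr, void *arg)
{
	int *count = arg;

	/* the iterator only passes IVHD and IVMD headers, so !IVHD means IVMD */
	if (!ivrs_is_ivhd(hdr->Type))
		(*count)++;

	return (1);	/* non-zero: keep walking the IVRS table */
}

It would be driven exactly like the IVHD counter, i.e. ivrs_hdr_iterate_tbl(ivmd_count_iter, &n).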
+ */ +static int +ivhd_dev_parse(ACPI_IVRS_HARDWARE1 *ivhd, struct amdvi_softc *softc) +{ + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; + int range_start_id = -1, range_end_id = -1, i; + uint32_t *extended; + uint8_t all_data = 0, range_data = 0; + bool range_enable_ats = false, enable_ats; + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_LEGACY: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE1); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE2); + break; + + default: + device_printf(softc->dev, + "unknown type: 0x%x\n", ivhd->Header.Type); + return (-1); + } + + end = (uint8_t *)ivhd + ivhd->Header.Length; + + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; + switch (de->Type) { + case ACPI_IVRS_TYPE_ALL: + all_data = de->DataSetting; + for (i = 0; i < softc->dev_cfg_cnt; i++) + softc->dev_cfg[i].data |= all_data; + break; + + case ACPI_IVRS_TYPE_SELECT: + case ACPI_IVRS_TYPE_ALIAS_SELECT: + case ACPI_IVRS_TYPE_EXT_SELECT: + enable_ats = false; + if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { + extended = (uint32_t *)(de + 1); + enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + ivhd_dev_add_entry(softc, de->Id, de->Id, + de->DataSetting | all_data, enable_ats); + break; + + case ACPI_IVRS_TYPE_START: + case ACPI_IVRS_TYPE_ALIAS_START: + case ACPI_IVRS_TYPE_EXT_START: + if (range_start_id != -1) { + device_printf(softc->dev, + "Unexpected start-of-range device entry\n"); + return (EINVAL); + } + range_start_id = de->Id; + range_data = de->DataSetting; + if (de->Type == ACPI_IVRS_TYPE_EXT_START) { + extended = (uint32_t *)(de + 1); + range_enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + break; + + case ACPI_IVRS_TYPE_END: + if (range_start_id == -1) { + device_printf(softc->dev, + "Unexpected end-of-range device entry\n"); + return (EINVAL); + } + range_end_id = de->Id; + if (range_end_id < range_start_id) { + device_printf(softc->dev, + "Device entry range going backward\n"); + return (EINVAL); + } + ivhd_dev_add_entry(softc, range_start_id, range_end_id, + range_data | all_data, range_enable_ats); + range_start_id = range_end_id = -1; + range_data = 0; + all_data = 0; + break; + + case ACPI_IVRS_TYPE_PAD4: + break; + + case ACPI_IVRS_TYPE_SPECIAL: + /* HPET or IOAPIC */ + break; + default: + if ((de->Type < 5) || + (de->Type >= ACPI_IVRS_TYPE_PAD8)) + device_printf(softc->dev, + "Unknown dev entry:0x%x\n", de->Type); + } + + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } + } + + return (0); +} + +static bool +ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) +{ + if (old->DeviceId == new->DeviceId) { + /* + * Newer IVRS header type take precedence. + */ + if (old->Type == IVRS_TYPE_HARDWARE_LEGACY && + ((new->Type == IVRS_TYPE_HARDWARE_EFR) || + (new->Type == IVRS_TYPE_HARDWARE_MIXED))) + return (true); + + /* + * Mixed format IVHD header type take precedence + * over fixed format IVHD header types. 
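The parse loop above advances through the device entries with a fixed stride chosen by entry type: 4-byte entries below type 0x40, 8-byte entries below type 0x80, and variable-length entries beyond that, which it does not handle. The same rule restated as a small helper (the name is ours, shown only to make the stride logic explicit):

static inline size_t
ivhd_de_size(uint8_t type)
{
	if (type < 0x40)
		return (sizeof (ACPI_IVRS_DEVICE4));	/* 4-byte entry */
	if (type < 0x80)
		return (sizeof (ACPI_IVRS_DEVICE8A));	/* 8-byte entry */
	return (0);	/* variable-sized entry: caller must bail out */
}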
+ */ + if (old->Type == IVRS_TYPE_HARDWARE_EFR && + new->Type == IVRS_TYPE_HARDWARE_MIXED) + return (true); + } + + return (false); +} + +static void +ivhd_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HARDWARE1 *ivhd; + ACPI_STATUS status; + int i, j, count = 0; + uint32_t ivrs_ivinfo; + + if (acpi_disabled("ivhd")) + return; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_ivinfo = ivrs->Info; + printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" + " flags:%b\n", + REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), + REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), + "\020\001EFRSup"); + + ivrs_hdr_iterate_tbl(ivhd_count_iter, &count); + if (!count) + return; + + ivhd_hdrs = kmem_zalloc(sizeof(void *) * count, KM_SLEEP); + for (i = 0; i < count; i++) { + ivhd = ivhd_find_by_index(i); + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Scan for presence of legacy and non-legacy device type + * for same IOMMU device and override the old one. + * + * If there is no existing IVHD to the same IOMMU device, + * the IVHD header pointer is appended. + */ + for (j = 0; j < ivhd_count; j++) { + if (ivhd_is_newer(&ivhd_hdrs[j]->Header, &ivhd->Header)) + break; + } + ivhd_hdrs[j] = ivhd; + if (j == ivhd_count) + ivhd_count++; + } + + ivhd_devs = kmem_zalloc(sizeof(device_t) * ivhd_count, KM_SLEEP); + for (i = 0, j = 0; i < ivhd_count; i++) { + ivhd = ivhd_hdrs[i]; + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Use a high order to ensure that this driver is probed after + * the Host-PCI bridge and the root PCI bus. + */ + ivhd_devs[i] = BUS_ADD_CHILD(parent, + ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); + + /* + * XXX: In case device was not destroyed before, add will fail. + * locate the old device instance. + */ + if (ivhd_devs[i] == NULL) { + ivhd_devs[i] = device_find_child(parent, "ivhd", i); + if (ivhd_devs[i] == NULL) { + printf("AMD-Vi: cant find ivhd%d\n", i); + break; + } + } + j++; + } + + /* + * Update device count in case failed to attach. + */ + ivhd_count = j; +} + +static int +ivhd_probe(device_t dev) +{ + ACPI_IVRS_HARDWARE1 *ivhd; + int unit; + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); + break; + + case IVRS_TYPE_HARDWARE_MIXED: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); + break; + + case IVRS_TYPE_HARDWARE_LEGACY: + default: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); + break; + } + + return (BUS_PROBE_NOWILDCARD); +} + +static void +ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) +{ + /* + * IVHD lgeacy type has two extra high bits in flag which has + * been moved to EFR for non-legacy device. 
+ */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\010PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004<b4>" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. */ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003<b2>" + "\004NXSup" + "\005GTSup" + "\006<b5>" + "\007IASup" + "\010GASup" + "\011HESup" + "\012PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\011PprOvrflwEarlySup" + "\012PPRAutoRspSup" + "\015BlKStopMrkSup" + "\016PerfOptSup" + "\017MsiCapMmioSup" + "\021GIOSup" + "\022HASup" + "\023EPHSup" + "\024AttrFWSup" + "\025HDSup" + "\027InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE1 * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. 
*/ + if (max_ptp_level < amdvi_ptp_level) { + device_printf(dev, "insufficient PTP level:%d\n", + max_ptp_level); + return (EINVAL); + } else { + device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", + max_ptp_level, amdvi_ptp_level); + } + + return (0); +} + +static int +ivhd_attach(device_t dev) +{ + ACPI_IVRS_HARDWARE1 *ivhd; + ACPI_IVRS_HARDWARE2 *ivhd_efr; + struct amdvi_softc *softc; + int status, unit; + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + /* Make sure its same device for which attach is called. */ + KASSERT((ivhd_devs[unit] == dev), + ("Not same device old %p new %p", ivhd_devs[unit], dev)); + + softc = device_get_softc(dev); + softc->dev = dev; + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + softc->pci_dev = pci_find_bsf(PCI_RID2BUS(ivhd->Header.DeviceId), + PCI_RID2SLOT(ivhd->Header.DeviceId), + PCI_RID2FUNC(ivhd->Header.DeviceId)); + + softc->ivhd_type = ivhd->Header.Type; + softc->pci_seg = ivhd->PciSegmentGroup; + softc->pci_rid = ivhd->Header.DeviceId; + softc->ivhd_flag = ivhd->Header.Flags; + /* + * On lgeacy IVHD type(0x10), it is documented as feature + * but in newer type it is attribute. + */ + softc->ivhd_feature = ivhd->FeatureReporting; + /* + * PCI capability has more capabilities that are not part of IVRS. + */ + softc->cap_off = ivhd->CapabilityOffset; + +#ifdef notyet + /* IVHD Info bit[4:0] is event MSI/X number. */ + softc->event_msix = ivhd->Info & 0x1F; +#endif + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + ivhd_efr = (ACPI_IVRS_HARDWARE2 *)ivhd; + softc->ext_feature = ivhd_efr->EfrRegisterImage; + break; + } + + softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); + status = ivhd_dev_parse(ivhd, softc); + if (status != 0) { + device_printf(dev, + "endpoint device parsing error=%d\n", status); + goto fail; + } + + status = ivhd_print_cap(softc, ivhd); + if (status != 0) + goto fail; + + status = amdvi_setup_hw(softc); + if (status != 0) { + device_printf(dev, "couldn't be initialised, error=%d\n", + status); + goto fail; + } + + return (0); + +fail: + free(softc->dev_cfg, M_DEVBUF); + return (status); +} + +static int +ivhd_detach(device_t dev) +{ + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + + amdvi_teardown_hw(softc); + free(softc->dev_cfg, M_DEVBUF); + + /* + * XXX: delete the device. + * don't allow detach, return EBUSY. + */ + return (0); +} + +static int +ivhd_suspend(device_t dev) +{ + + return (0); +} + +static int +ivhd_resume(device_t dev) +{ + + return (0); +} + +static device_method_t ivhd_methods[] = { + DEVMETHOD(device_identify, ivhd_identify), + DEVMETHOD(device_probe, ivhd_probe), + DEVMETHOD(device_attach, ivhd_attach), + DEVMETHOD(device_detach, ivhd_detach), + DEVMETHOD(device_suspend, ivhd_suspend), + DEVMETHOD(device_resume, ivhd_resume), + DEVMETHOD_END +}; + +static driver_t ivhd_driver = { + "ivhd", + ivhd_methods, + sizeof(struct amdvi_softc), +}; + +static devclass_t ivhd_devclass; + +/* + * Load this module at the end after PCI re-probing to configure interrupt. 
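ivhd_attach() above locates the IOMMU's own PCI function by splitting the IVHD DeviceId, which is a PCI routing ID (BDF). Assuming the conventional bus[15:8] / device[7:3] / function[2:0] split behind PCI_RID2BUS, PCI_RID2SLOT and PCI_RID2FUNC, the decomposition looks like this (the example RID is hypothetical):

uint16_t rid  = 0x0002;			/* example routing ID */
uint8_t  bus  = (rid >> 8) & 0xFF;	/* bits [15:8] -> bus 0 */
uint8_t  slot = (rid >> 3) & 0x1F;	/* bits [7:3]  -> device 0 */
uint8_t  func = rid & 0x07;		/* bits [2:0]  -> function 2 */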
+ */ +DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, + SI_ORDER_ANY); +MODULE_DEPEND(ivhd, acpi, 1, 1, 1); +MODULE_DEPEND(ivhd, pci, 1, 1, 1); diff --git a/usr/src/uts/intel/io/vmm/amd/offsets.in b/usr/src/uts/intel/io/vmm/amd/offsets.in new file mode 100644 index 0000000000..ad4ee7155a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/offsets.in @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2017 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ +#include <sys/types.h> +#include <sys/cpuvar.h> + +#include "amd/svm.h" + +svm_regctx + sctx_rbx SCTX_RBX + sctx_rcx SCTX_RCX + sctx_rbp SCTX_RBP + sctx_rdx SCTX_RDX + sctx_rdi SCTX_RDI + sctx_rsi SCTX_RSI + sctx_r8 SCTX_R8 + sctx_r9 SCTX_R9 + sctx_r10 SCTX_R10 + sctx_r11 SCTX_R11 + sctx_r12 SCTX_R12 + sctx_r13 SCTX_R13 + sctx_r14 SCTX_R14 + sctx_r15 SCTX_R15 + +/* Need access to GDT to restore TSS */ +cpu + cpu_m.mcpu_gdt CPU_GDT + +user_desc USER_DESC_SZ + +system_desc + ssd_type SSD_TYPE + +\#define GDT_KTSS_OFF _MUL(USER_DESC_SZ, GDT_KTSS) + +/* Necessary for TSS-related data */ +\#include <sys/segments.h> + +/* Pull in definition for MSR_GSBASE */ +\#include <machine/specialreg.h> diff --git a/usr/src/uts/intel/io/vmm/amd/svm.c b/usr/src/uts/intel/io/vmm/amd/svm.c new file mode 100644 index 0000000000..11c1e9c249 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm.c @@ -0,0 +1,2466 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#include <sys/x86_archext.h> +#include <sys/trap.h> + +#include <machine/cpufunc.h> +#include <machine/psl.h> +#include <machine/md_var.h> +#include <machine/reg.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <sys/vmm_instruction_emul.h> +#include <sys/vmm_vm.h> +#include <sys/vmm_kernel.h> + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +/* SVM features advertised by CPUID.8000000AH:EDX */ +static uint32_t svm_feature = ~0U; /* AMD SVM features. 
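svm_feature above holds the SVM capability bits advertised in %edx of CPUID leaf 0x8000_000A, and individual capabilities are tested with simple bit checks; flush_by_asid() and decode_assist(), defined shortly after, follow exactly this shape. A hypothetical test for the nRIP-save capability would look the same:

static __inline bool
nrip_save_supported(void)
{
	/* illustrative only; mirrors flush_by_asid()/decode_assist() below */
	return ((svm_feature & AMD_CPUID_SVM_NRIP_SAVE) != 0);
}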
*/ + +static int disable_npf_assist; + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); +static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val); +static void flush_asid(struct svm_softc *sc, int vcpuid); + +static __inline bool +flush_by_asid(void) +{ + return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0); +} + +static __inline bool +decode_assist(void) +{ + return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0); +} + +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(void) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. + */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 
1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, + int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " + "from %x to %x", idx, oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_ctrl = NP_ENABLE; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. + */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + } + + /* + * Selectively intercept writes to %cr0. This triggers on operations + * which would change bits other than TS or MP. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_CR0_WRITE); + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(sc->vm, vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_FERR_FREEZE); + + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* Intercept privileged invalidation instructions. 
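In the control-register intercept vector configured by vmcb_init() above, bit n intercepts reads of %crN and bit (n + 16) intercepts writes, which is why the loop builds its mask as (BIT(n) << 16) | BIT(n) and why the %cr0 shadowing code later toggles BIT(0) | BIT(16) as a pair. Restated as a tiny helper (the name is ours, purely for illustration):

static inline uint32_t
svm_cr_intercept_mask(int n)
{
	/* read intercept in the low half, write intercept in the high half */
	return (BIT(n) | (BIT(n) << 16));
}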
*/ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA); + + /* + * Intercept all virtualization-related instructions. + * + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT); + + /* + * The ASID will be set to a non-zero value just before VMRUN. + */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_ctrl |= V_INTR_MASKING; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->misc_ctrl |= LBR_VIRT_ENABLE; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. + */ +static void * +svm_vminit(struct vm *vm) +{ + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + int i; + uint16_t maxcpus; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + if (((uintptr_t)svm_sc & PAGE_MASK) != 0) + panic("malloc of svm_softc not aligned on page boundary"); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm)); + + /* + * Intercept read and write accesses to all MSRs. + */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. 
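For a concrete sense of where those pass-through MSRs land in the permission bitmap, here is the svm_msr_index() arithmetic above worked through for MSR_LSTAR (0xC0000082); the numbers simply replay the code, they are not an additional interface.

/*
 * Worked example for MSR_LSTAR (0xC0000082):
 *	off   = 0xC0000082 - MSR_AMD6TH_START             = 0x82
 *	base  = (MSR_PENTIUM_END - MSR_PENTIUM_START + 1)  = 0x2000
 *	index = (0x82 + 0x2000) / 4                        = 0x820
 *	bit   = (0xC0000082 % 4) * 2                       = 4
 * so svm_msr_rw_ok() clears bit 4 (read) and bit 5 (write) of byte 0x820,
 * letting the guest access that MSR without a #VMEXIT.
 */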
+ */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + iopm_pa = vtophys(svm_sc->iopm_bitmap); + msrpm_pa = vtophys(svm_sc->msr_bitmap); + pml4_pa = svm_sc->nptp; + maxcpus = vm_get_maxcpus(svm_sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(&vcpu->vmcb); + vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); + svm_msr_guest_init(svm_sc, i); + } + return (svm_sc); +} + +/* + * Collateral for a generic SVM VM-exit. + */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_state *state; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + struct vmcb_segment *seg; + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. + */ + seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); + if (seg->attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct vm_inout *inout; + struct vie *vie; + uint64_t info1; + struct vm_guest_paging paging; + + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + inout = &vmexit->u.inout; + info1 = ctrl->exitinfo1; + + inout->bytes = (info1 >> 4) & 0x7; + inout->flags = 0; + inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0; + inout->flags |= (info1 & BIT(3)) ? 
INOUT_REP : 0; + inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0; + inout->port = (uint16_t)(info1 >> 16); + inout->eax = (uint32_t)(state->rax); + + if ((inout->flags & INOUT_STR) != 0) { + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * This is not specified explicitly in APMv2 but can be verified + * empirically. + */ + if (!decode_assist()) { + /* + * Without decoding assistance, force the task of + * emulating the ins/outs on userspace. + */ + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + bzero(&vmexit->u.inst_emul, + sizeof (vmexit->u.inst_emul)); + return (UNHANDLED); + } + + /* + * Bits 7-9 encode the address size of ins/outs operations where + * the 1/2/4 values correspond to 16/32/64 bit sizes. + */ + inout->addrsize = 2 * ((info1 >> 7) & 0x7); + VERIFY(inout->addrsize == 2 || inout->addrsize == 4 || + inout->addrsize == 8); + + if (inout->flags & INOUT_IN) { + /* + * For INS instructions, %es (encoded as 0) is the + * implied segment for the operation. + */ + inout->segment = 0; + } else { + /* + * Bits 10-12 encode the segment for OUTS. + * This value follows the standard x86 segment order. + */ + inout->segment = (info1 >> 10) & 0x7; + } + } + + vmexit->exitcode = VM_EXITCODE_INOUT; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging); + vie = vm_vie_ctx(svm_sc->vm, vcpu); + vie_init_inout(vie, inout, vmexit->inst_length, &paging); + + /* The in/out emulation will handle advancing %rip */ + vmexit->inst_length = 0; + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (PROT_EXEC); + else + return (PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, + uint64_t gpa) +{ + struct vmcb_ctrl *ctrl; + struct vmcb *vmcb; + struct vie *vie; + struct vm_guest_paging paging; + struct vmcb_segment *seg; + char *inst_bytes = NULL; + uint8_t inst_len = 0; + + vmcb = svm_get_vmcb(svm_sc, vcpu); + ctrl = &vmcb->ctrl; + + vmexit->exitcode = VM_EXITCODE_MMIO_EMUL; + vmexit->u.mmio_emul.gpa = gpa; + vmexit->u.mmio_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, &paging); + + switch (paging.cpu_mode) { + case CPU_MODE_REAL: + seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); + vmexit->u.mmio_emul.cs_base = seg->base; + vmexit->u.mmio_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); + vmexit->u.mmio_emul.cs_base = seg->base; + + /* + * Section 4.8.1 of APM2, Default Operand Size or D bit. + */ + vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ? + 1 : 0; + break; + default: + vmexit->u.mmio_emul.cs_base = 0; + vmexit->u.mmio_emul.cs_d = 0; + break; + } + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = (char *)ctrl->inst_bytes; + } + vie = vm_vie_ctx(svm_sc->vm, vcpu); + vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa); +} + +/* + * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0 + * which is live in the guest. 
They are visible via the shadow instead. + */ +#define SVM_CR0_MASK ~(CR0_CD | CR0_NW | 0xffffffff00000000) + +static void +svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write) +{ + struct vmcb_state *state; + struct svm_regctx *regctx; + uint64_t masked, old, diff; + + state = svm_get_vmcb_state(svm_sc, vcpu); + regctx = svm_get_guest_regctx(svm_sc, vcpu); + + old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK); + diff = old ^ val; + + /* No further work needed if register contents remain the same */ + if (diff == 0) { + return; + } + + /* Flush the TLB if the paging or write-protect bits are changing */ + if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) { + flush_asid(svm_sc, vcpu); + } + + /* + * If the change in %cr0 is due to a guest action (via interception) + * then other CPU state updates may be required. + */ + if (guest_write) { + if ((diff & CR0_PG) != 0) { + uint64_t efer = state->efer; + + /* Keep the long-mode state in EFER in sync */ + if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) { + state->efer |= EFER_LMA; + } + if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) { + state->efer &= ~EFER_LMA; + } + } + } + + masked = val & SVM_CR0_MASK; + regctx->sctx_cr0_shadow = val; + state->cr0 = masked; + svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR); + + if ((masked ^ val) != 0) { + /* + * The guest has set bits in %cr0 which we are masking out and + * exposing via shadow. + * + * We must intercept %cr0 reads in order to make the shadowed + * view available to the guest. + * + * Writes to %cr0 must also be intercepted (unconditionally, + * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch + * if/when the guest clears those shadowed bits. + */ + svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT, + BIT(0) | BIT(16)); + } else { + /* + * When no bits remain in %cr0 which require shadowing, the + * unconditional intercept of reads/writes to %cr0 can be + * disabled. + * + * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains + * in place so we can be notified of operations which change + * bits other than TS or MP. + */ + svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT, + BIT(0) | BIT(16)); + } + svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I); +} + +static void +svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val) +{ + struct vmcb *vmcb; + struct svm_regctx *regctx; + + vmcb = svm_get_vmcb(svm_sc, vcpu); + regctx = svm_get_guest_regctx(svm_sc, vcpu); + + /* + * Include the %cr0 bits which exist only in the shadow along with those + * in the running vCPU state. + */ + *val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK); +} + +static void +svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg) +{ + uint64_t val; + int err; + + svm_get_cr0(svm_sc, vcpu, &val); + err = svm_setreg(svm_sc, vcpu, reg, val); + ASSERT(err == 0); +} + +static void +svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg) +{ + struct vmcb_state *state; + uint64_t val; + int err; + + state = svm_get_vmcb_state(svm_sc, vcpu); + + err = svm_getreg(svm_sc, vcpu, reg, &val); + ASSERT(err == 0); + + if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) { + /* NW without CD is nonsensical */ + vm_inject_gp(svm_sc->vm, vcpu); + return; + } + if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) { + /* PG requires PE */ + vm_inject_gp(svm_sc->vm, vcpu); + return; + } + if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) { + /* When enabling paging, PAE must be enabled if LME is. 
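The interplay between SVM_CR0_MASK, the live VMCB %cr0 and the shadow value above can be summarized with one hypothetical guest write that sets the cache-disable bit:

uint64_t val  = CR0_PG | CR0_PE | CR0_CD;	/* value the guest writes */
uint64_t live = val & SVM_CR0_MASK;		/* what the CPU runs with: CD stripped */
uint64_t seen = live | (val & ~SVM_CR0_MASK);	/* what a guest read returns: CD visible again */

Because live differs from val in this case, svm_set_cr0() also enables the unconditional BIT(0) | BIT(16) read/write intercepts so the shadowed view stays coherent.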
*/ + if ((state->efer & EFER_LME) != 0 && + (state->cr4 & CR4_PAE) == 0) { + vm_inject_gp(svm_sc->vm, vcpu); + return; + } + } + + svm_set_cr0(svm_sc, vcpu, val, true); +} + +static void +svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vie *vie; + struct vm_guest_paging paging; + + /* Let the instruction emulation (hopefully in-kernel) handle it */ + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul)); + vie = vm_vie_ctx(svm_sc->vm, vcpu); + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging); + vie_init_other(vie, &paging); + + /* The instruction emulation will handle advancing %rip */ + vmexit->inst_length = 0; +} + +static void +svm_update_virqinfo(struct svm_softc *sc, int vcpu) +{ + struct vm *vm; + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vm = sc->vm; + vlapic = vm_lapic(vm, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", + intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); + vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); +} + +static __inline int +vintr_intercept_enabled(struct svm_softc *sc, int vcpu) +{ + + return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_VINTR)); +} + +static void +svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) { + KASSERT(ctrl->v_intr_prio & V_IGN_TPR, + ("%s: invalid v_ign_tpr", __func__)); + KASSERT(vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be enabled", __func__)); + return; + } + + /* + * We use V_IRQ in conjunction with the VINTR intercept to trap into the + * hypervisor as soon as a virtual interrupt can be delivered. + * + * Since injected events are not subject to intercept checks we need to + * ensure that the V_IRQ is not actually going to be delivered on VM + * entry. 
+ */ + VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow); + + VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); + ctrl->v_irq |= V_IRQ; + ctrl->v_intr_prio |= V_IGN_TPR; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static void +svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) { + KASSERT(!vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be disabled", __func__)); + return; + } + + VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); + ctrl->v_irq &= ~V_IRQ; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +/* + * Once an NMI is injected it blocks delivery of further NMIs until the handler + * executes an IRET. The IRET intercept is enabled when an NMI is injected to + * to track when the vcpu is done handling the NMI. + */ +static int +svm_nmi_blocked(struct svm_softc *sc, int vcpu) +{ + return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_IRET)); +} + +static void +svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked")); + VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); + /* + * When the IRET intercept is cleared the vcpu will attempt to execute + * the "iret" when it runs next. However, it is possible to inject + * another NMI into the vcpu before the "iret" has actually executed. + * + * For e.g. if the "iret" encounters a #NPF when accessing the stack + * it will trap back into the hypervisor. If an NMI is pending for + * the vcpu it will be injected into the guest. + * + * XXX this needs to be fixed + */ + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set an interrupt shadow to prevent an NMI from being immediately + * injected on the next VMRUN. + */ + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + ctrl->intr_shadow = 1; +} + +static void +svm_inject_event(struct svm_softc *sc, int vcpu, uint64_t intinfo) +{ + struct vmcb_ctrl *ctrl; + uint8_t vector; + uint32_t evtype; + + ASSERT(VMCB_EXITINTINFO_VALID(intinfo)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vector = VMCB_EXITINTINFO_VECTOR(intinfo); + evtype = VMCB_EXITINTINFO_TYPE(intinfo); + + switch (evtype) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + VERIFY(vector <= 31); + /* + * NMIs are expected to be injected with VMCB_EVENTINJ_TYPE_NMI, + * rather than as an exception with the NMI vector. + */ + VERIFY(vector != 2); + break; + default: + panic("unexpected event type %x", evtype); + } + + ctrl->eventinj = VMCB_EVENTINJ_VALID | evtype | vector; + if (VMCB_EXITINTINFO_EC_VALID(intinfo)) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)VMCB_EXITINTINFO_EC(intinfo) << 32; + } +} + +static void +svm_inject_nmi(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + ASSERT(!svm_nmi_blocked(sc, vcpu)); + + ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI; + vm_nmi_clear(sc->vm, vcpu); + + /* + * Virtual NMI blocking is now in effect. 
+ * + * Not only does this block a subsequent NMI injection from taking + * place, it also configures an intercept on the IRET so we can track + * when the next injection can take place. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); +} + +static void +svm_inject_irq(struct svm_softc *sc, int vcpu, int vector) +{ + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + ASSERT(vector >= 0 && vector <= 255); + + ctrl->eventinj = VMCB_EVENTINJ_VALID | vector; +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static vm_msr_result_t +svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval) +{ + struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu); + uint64_t lma; + int error; + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + + if (newval & EFER_MBZ_BITS) { + return (VMR_GP); + } + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + const uint64_t changed = state->efer ^ newval; + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) { + return (VMR_GP); + } + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) { + lma = EFER_LMA; + } else { + lma = 0; + } + if ((newval & EFER_LMA) != lma) { + return (VMR_GP); + } + + if ((newval & EFER_NXE) != 0 && + !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) { + return (VMR_GP); + } + if ((newval & EFER_FFXSR) != 0 && + !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) { + return (VMR_GP); + } + if ((newval & EFER_TCE) != 0 && + !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) { + return (VMR_GP); + } + + /* + * Until bhyve has proper support for long-mode segment limits, just + * toss a #GP at the guest if they attempt to use it. + */ + if (newval & EFER_LMSLE) { + return (VMR_GP); + } + + error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); + VERIFY0(error); + return (VMR_OK); +} + +static int +svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, + bool is_wrmsr) +{ + struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu); + struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu); + const uint32_t ecx = ctx->sctx_rcx; + vm_msr_result_t res; + uint64_t val = 0; + + if (is_wrmsr) { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); + val = ctx->sctx_rdx << 32 | (uint32_t)state->rax; + + if (vlapic_owned_msr(ecx)) { + struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu); + + res = vlapic_wrmsr(vlapic, ecx, val); + } else if (ecx == MSR_EFER) { + res = svm_write_efer(svm_sc, vcpu, val); + } else { + res = svm_wrmsr(svm_sc, vcpu, ecx, val); + } + } else { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); + + if (vlapic_owned_msr(ecx)) { + struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu); + + res = vlapic_rdmsr(vlapic, ecx, &val); + } else { + res = svm_rdmsr(svm_sc, vcpu, ecx, &val); + } + } + + switch (res) { + case VMR_OK: + /* Store rdmsr result in the appropriate registers */ + if (!is_wrmsr) { + state->rax = (uint32_t)val; + ctx->sctx_rdx = val >> 32; + } + return (1); + case VMR_GP: + vm_inject_gp(svm_sc->vm, vcpu); + return (1); + case VMR_UNHANLDED: + vmexit->exitcode = is_wrmsr ? + VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + return (0); + default: + panic("unexpected msr result %u\n", res); + } +} + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. 
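+ *
+ * When nRIP is valid, svm_vmexit() computes the intercepted instruction's
+ * length as (nrip - rip); for all other exit codes the length is treated
+ * as zero, since the field cannot be trusted there.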
+ * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2; + int error, errcode_valid = 0, handled, idtvec, reflect; + + ctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb = svm_get_vmcb(svm_sc, vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(svm_sc, vcpu); + svm_save_exitintinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_CR0_READ: + if (VMCB_CRx_INFO1_VALID(info1) != 0) { + svm_handle_cr0_read(svm_sc, vcpu, + vie_regnum_map(VMCB_CRx_INFO1_GPR(info1))); + handled = 1; + } else { + /* + * If SMSW is used to read the contents of %cr0, then + * the VALID bit will not be set in `info1`, since the + * handling is different from the mov-to-reg case. + * + * Punt to the instruction emulation to handle it. + */ + svm_inst_emul_other(svm_sc, vcpu, vmexit); + } + break; + case VMCB_EXIT_CR0_WRITE: + case VMCB_EXIT_CR0_SEL_WRITE: + if (VMCB_CRx_INFO1_VALID(info1) != 0) { + svm_handle_cr0_write(svm_sc, vcpu, + vie_regnum_map(VMCB_CRx_INFO1_GPR(info1))); + handled = 1; + } else { + /* + * Writes to %cr0 without VALID being set in `info1` are + * initiated by the LMSW and CLTS instructions. While + * LMSW (like SMSW) sees little use in modern OSes and + * bootloaders, CLTS is still used for handling FPU + * state transitions. + * + * Punt to the instruction emulation to handle them. + */ + svm_inst_emul_other(svm_sc, vcpu, vmexit); + } + break; + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. 
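+ *
+ * Zeroing inst_length below leaves the guest %rip pointing at the iret
+ * itself, so it is re-executed once svm_clear_nmi_blocking() has dropped
+ * the intercept.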
+ */ + vmexit->inst_length = 0; + svm_clear_nmi_blocking(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + svm_disable_intr_window_exiting(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: + case VMCB_EXIT_SMI: + case VMCB_EXIT_INIT: + /* + * For external NMI/SMI and physical INIT interrupts, simply + * continue execution, as those host events will be handled by + * the physical CPU. + */ + handled = 1; + break; + case 0x40 ... 0x5F: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. + */ + reflect = 0; + VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); + vmm_call_trap(T_MCE); + break; + case IDT_PF: + error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, + info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + + case IDT_BP: + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + + if (reflect) { + /* Reflect the exception back into the guest */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " + "%d/%x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + } + handled = 1; + break; + case VMCB_EXIT_MSR: + handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0); + break; + case VMCB_EXIT_IO: + handled = svm_handle_inout(svm_sc, vcpu, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_SHUTDOWN: + vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT); + handled = 1; + break; + case VMCB_EXIT_INVD: + case VMCB_EXIT_INVLPGA: + /* privileged invalidation instructions */ + vm_inject_ud(svm_sc->vm, vcpu); + handled = 1; + break; + case VMCB_EXIT_VMRUN: + case VMCB_EXIT_VMLOAD: + case VMCB_EXIT_VMSAVE: + case VMCB_EXIT_STGI: + case VMCB_EXIT_CLGI: + case VMCB_EXIT_SKINIT: + /* privileged vmm instructions */ + vm_inject_ud(svm_sc->vm, vcpu); + handled = 1; + break; + case VMCB_EXIT_VMMCALL: + /* No handlers make use of VMMCALL for now */ + vm_inject_ud(svm_sc->vm, vcpu); + handled = 1; + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax, + &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 
1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " + "reserved bits set: info1(%lx) info2(%lx)", + info1, info2); + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " + "on gpa %lx/%lx at rip %lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "mmio_emul fault " + "for gpa %lx/%lx at rip %lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t, + code); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +/* + * Inject exceptions, NMIs, and ExtINTs. + * + * The logic behind these are complicated and may involve mutex contention, so + * the injection is performed without the protection of host CPU interrupts + * being disabled. This means a racing notification could be "lost", + * necessitating a later call to svm_inject_recheck() to close that window + * of opportunity. + */ +static enum event_inject_state +svm_inject_events(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint64_t intinfo; + enum event_inject_state ev_state; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + ev_state = EIS_CAN_INJECT; + + /* Clear any interrupt shadow if guest %rip has changed */ + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + } + + /* + * An event is already pending for injection. This can occur when the + * vCPU exits prior to VM entry (like for an AST). + */ + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + return (EIS_EV_EXISTING | EIS_REQ_EXIT); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). 
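+ *
+ * Injection proceeds in priority order: pending intinfo first, then NMI,
+ * then legacy ExtINT. Interrupts from the vLAPIC are handled later, in
+ * svm_inject_vlapic(), once host interrupts have been disabled.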
+ */ + if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) { + ASSERT(VMCB_EXITINTINFO_VALID(intinfo)); + + svm_inject_event(sc, vcpu, intinfo); + vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + ev_state = EIS_EV_INJECTED; + } + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) { + if (ev_state == EIS_CAN_INJECT) { + /* Can't inject NMI if vcpu is in an intr_shadow. */ + if (ctrl->intr_shadow) { + return (EIS_GI_BLOCK); + } + + svm_inject_nmi(sc, vcpu); + ev_state = EIS_EV_INJECTED; + } else { + return (ev_state | EIS_REQ_EXIT); + } + } + + if (vm_extint_pending(sc->vm, vcpu)) { + int vector; + + if (ev_state != EIS_CAN_INJECT) { + return (ev_state | EIS_REQ_EXIT); + } + + /* + * If the guest has disabled interrupts or is in an interrupt + * shadow then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { + return (EIS_GI_BLOCK); + } + + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + + svm_inject_irq(sc, vcpu, vector); + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + ev_state = EIS_EV_INJECTED; + } + + return (ev_state); +} + +/* + * Synchronize vLAPIC state and inject any interrupts pending on it. + * + * This is done with host CPU interrupts disabled so notification IPIs will be + * queued on the host APIC and recognized when entering SVM guest context. + */ +static enum event_inject_state +svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic, + enum event_inject_state ev_state) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + int vector; + uint8_t v_tpr; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* + * The guest can modify the TPR by writing to %cr8. In guest mode the + * CPU reflects this write to V_TPR without hypervisor intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + /* If an event cannot otherwise be injected, we are done for now */ + if (ev_state != EIS_CAN_INJECT) { + return (ev_state); + } + + if (!vlapic_pending_intr(vlapic, &vector)) { + return (EIS_CAN_INJECT); + } + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { + return (EIS_GI_BLOCK); + } + + svm_inject_irq(sc, vcpu, vector); + vlapic_intr_accepted(vlapic, vector); + return (EIS_EV_INJECTED); +} + +/* + * Re-check for events to be injected. + * + * Once host CPU interrupts are disabled, check for the presence of any events + * which require injection processing. If an exit is required upon injection, + * or once the guest becomes interruptable, that will be configured too. 
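+ *
+ * Returns true if the caller should take another lap through the event
+ * injection logic before entering the guest.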
+ */ +static bool +svm_inject_recheck(struct svm_softc *sc, int vcpu, + enum event_inject_state ev_state) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (ev_state == EIS_CAN_INJECT) { + /* + * An active interrupt shadow would preclude us from injecting + * any events picked up during a re-check. + */ + if (ctrl->intr_shadow != 0) { + return (false); + } + + if (vm_nmi_pending(sc->vm, vcpu) && + !svm_nmi_blocked(sc, vcpu)) { + /* queued NMI not blocked by NMI-window-exiting */ + return (true); + } + if (vm_extint_pending(sc->vm, vcpu)) { + /* queued ExtINT not blocked by existing injection */ + return (true); + } + } else { + if ((ev_state & EIS_REQ_EXIT) != 0) { + /* + * Use a self-IPI to force an immediate exit after + * event injection has occurred. + */ + poke_cpu(CPU->cpu_id); + } else { + /* + * If any event is being injected, an exit immediately + * upon becoming interruptable again will allow pending + * or newly queued events to be injected in a timely + * manner. + */ + svm_enable_intr_window_exiting(sc, vcpu); + } + } + return (false); +} + + +static void +check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + uint8_t flush; + + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->nptgen != nptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->nptgen = nptgen; +} + +static void +flush_asid(struct svm_softc *sc, int vcpuid) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + uint8_t flush; + + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + true); + + ASSERT(flush != VMCB_TLB_FLUSH_NOTHING); + ctrl->asid = vcpustate->hma_asid.hsa_asid; + ctrl->tlb_ctrl = flush; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + /* + * A potential future optimization: We could choose to update the nptgen + * associated with the vCPU, since any pending nptgen change requiring a + * flush will be satisfied by the one which has just now been queued. + */ +} + +static __inline void +disable_gintr(void) +{ + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. 
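+ * Deferring those two ensures that no host breakpoints or DEBUGCTL
+ * features are armed while DR0-DR3 still hold guest values.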
+ */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +static void +svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid) +{ + const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid); + + if (ctrl->tsc_offset != offset) { + ctrl->tsc_offset = offset; + svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I); + } +} + + +/* + * Start vcpu with specified RIP. + */ +static int +svm_vmrun(void *arg, int vcpu, uint64_t rip) +{ + struct svm_regctx *gctx; + struct svm_softc *svm_sc; + struct svm_vcpu *vcpustate; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct vm_exit *vmexit; + struct vlapic *vlapic; + vm_client_t *vmc; + struct vm *vm; + uint64_t vmcb_pa; + int handled; + uint16_t ldt_sel; + + svm_sc = arg; + vm = svm_sc->vm; + + vcpustate = svm_get_vcpu(svm_sc, vcpu); + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + vlapic = vm_lapic(vm, vcpu); + vmc = vm_get_vmclient(vm, vcpu); + + gctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; + + if (vcpustate->lastcpu != curcpu) { + /* + * Force new ASID allocation by invalidating the generation. + */ + vcpustate->hma_asid.hsa_gen = 0; + + /* + * Invalidate the VMCB state cache by marking all fields dirty. + */ + svm_set_dirty(svm_sc, vcpu, 0xffffffff); + + /* + * XXX + * Setting 'vcpustate->lastcpu' here is bit premature because + * we may return from this function without actually executing + * the VMRUN instruction. This could happen if an AST or yield + * condition is pending on the first time through the loop. + * + * This works for now but any new side-effects of vcpu + * migration should take this case into account. + */ + vcpustate->lastcpu = curcpu; + vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); + } + + svm_apply_tsc_adjust(svm_sc, vcpu); + + svm_msr_guest_enter(svm_sc, vcpu); + + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; + + /* Update Guest RIP */ + state->rip = rip; + + do { + enum event_inject_state inject_state; + uint64_t nptgen; + + /* + * Initial event injection is complex and may involve mutex + * contention, so it must be performed with global interrupts + * still enabled. + */ + inject_state = svm_inject_events(svm_sc, vcpu); + handled = 0; + + /* + * Disable global interrupts to guarantee atomicity during + * loading of guest state. This includes not only the state + * loaded by the "vmrun" instruction but also software state + * maintained by the hypervisor: suspended and rendezvous + * state, NPT generation number, vlapic interrupts etc. + */ + disable_gintr(); + + /* + * Synchronizing and injecting vlapic state is lock-free and is + * safe (and prudent) to perform with interrupts disabled. + */ + inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic, + inject_state); + + /* + * Check for vCPU bail-out conditions. This must be done after + * svm_inject_events() to detect a triple-fault condition. + */ + if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) { + enable_gintr(); + break; + } + + if (vcpu_run_state_pending(vm, vcpu)) { + enable_gintr(); + vm_exit_run_state(vm, vcpu, state->rip); + break; + } + + /* + * If subsequent activity queued events which require injection + * handling, take another lap to handle them. 
+ */ + if (svm_inject_recheck(svm_sc, vcpu, inject_state)) { + enable_gintr(); + handled = 1; + continue; + } + + /* + * #VMEXIT resumes the host with the guest LDTR, so + * save the current LDT selector so it can be restored + * after an exit. The userspace hypervisor probably + * doesn't use a LDT, but save and restore it to be + * safe. + */ + ldt_sel = sldt(); + + /* + * Check the vmspace and ASID generations to ensure that the + * vcpu does not use stale TLB mappings. + */ + nptgen = vmc_table_enter(vmc); + check_asid(svm_sc, vcpu, curcpu, nptgen); + + ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; + vcpustate->dirty = 0; + VCPU_CTR1(vm, vcpu, "vmcb clean %x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + vcpu_ustate_change(vm, vcpu, VU_RUN); + VCPU_CTR1(vm, vcpu, "Resume execution at %lx", state->rip); + svm_dr_enter_guest(gctx); + svm_launch(vmcb_pa, gctx, get_pcpu()); + svm_dr_leave_guest(gctx); + vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + vmc_table_exit(vmc); + + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. */ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(svm_sc, vcpu); + + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; + + return (0); +} + +static void +svm_vmcleanup(void *arg) +{ + struct svm_softc *sc = arg; + + contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); + contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); + free(sc, M_SVM); +} + +static uint64_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) +{ + struct svm_softc *sc; + struct vmcb *vmcb; + uint64_t *regp; + uint64_t *fieldp; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); + if (regp != NULL) { + *val = *regp; + return (0); + } + + switch (ident) { + case VM_REG_GUEST_INTR_SHADOW: + *val = (vmcb->ctrl.intr_shadow != 0) ? 
1 : 0; + break; + + case VM_REG_GUEST_CR0: + svm_get_cr0(sc, vcpu, val); + break; + case VM_REG_GUEST_CR2: + case VM_REG_GUEST_CR3: + case VM_REG_GUEST_CR4: + case VM_REG_GUEST_DR6: + case VM_REG_GUEST_DR7: + case VM_REG_GUEST_EFER: + case VM_REG_GUEST_RAX: + case VM_REG_GUEST_RFLAGS: + case VM_REG_GUEST_RIP: + case VM_REG_GUEST_RSP: + fieldp = vmcb_regptr(vmcb, ident, NULL); + *val = *fieldp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + *val = seg->selector; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + return (EINVAL); + + default: + return (EINVAL); + } + + return (0); +} + +static int +svm_setreg(void *arg, int vcpu, int ident, uint64_t val) +{ + struct svm_softc *sc; + struct vmcb *vmcb; + uint64_t *regp; + uint64_t *fieldp; + uint32_t dirty; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); + if (regp != NULL) { + *regp = val; + return (0); + } + + dirty = VMCB_CACHE_NONE; + switch (ident) { + case VM_REG_GUEST_INTR_SHADOW: + vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0; + break; + + case VM_REG_GUEST_EFER: + fieldp = vmcb_regptr(vmcb, ident, &dirty); + /* EFER_SVM must always be set when the guest is executing */ + *fieldp = val | EFER_SVM; + dirty |= VMCB_CACHE_CR; + break; + + case VM_REG_GUEST_CR0: + svm_set_cr0(sc, vcpu, val, false); + break; + case VM_REG_GUEST_CR2: + case VM_REG_GUEST_CR3: + case VM_REG_GUEST_CR4: + case VM_REG_GUEST_DR6: + case VM_REG_GUEST_DR7: + case VM_REG_GUEST_RAX: + case VM_REG_GUEST_RFLAGS: + case VM_REG_GUEST_RIP: + case VM_REG_GUEST_RSP: + fieldp = vmcb_regptr(vmcb, ident, &dirty); + *fieldp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + dirty |= VMCB_CACHE_SEG; + seg = vmcb_segptr(vmcb, ident); + seg->selector = (uint16_t)val; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + return (EINVAL); + + default: + return (EINVAL); + } + + if (dirty != VMCB_CACHE_NONE) { + svm_set_dirty(sc, vcpu, dirty); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. This needs to be treated differently depending on + * whether 'running' is true/false. + */ + + return (0); +} + +static int +svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + seg = vmcb_segptr(vmcb, reg); + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
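+ * (The present bit sits at bit 7 of the VMCB attribute encoding, which
+ * is why the mask below is ~0x80.)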
+ */ + seg->attrib = VMCB_ACCESS2ATTR(desc->access); + if (SEG_DESC_UNUSABLE(desc->access)) { + seg->attrib &= ~0x80; + } + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); + seg = vmcb_segptr(vmcb, reg); + break; + + default: + return (EINVAL); + } + + ASSERT(seg != NULL); + seg->base = desc->base; + seg->limit = desc->limit; + + return (0); +} + +static int +svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + switch (reg) { + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + seg = vmcb_segptr(vmcb, reg); + desc->access = VMCB_ATTR2ACCESS(seg->attrib); + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if ((desc->access & 0x80) == 0) { + /* Unusable segment */ + desc->access |= 0x10000; + } + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, reg); + desc->access = VMCB_ATTR2ACCESS(seg->attrib); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + seg = vmcb_segptr(vmcb, reg); + /* + * Since there are no access bits associated with the GDTR or + * the IDTR, zero out the field to ensure it does not contain + * garbage which might confuse the consumer. + */ + desc->access = 0; + break; + + default: + return (EINVAL); + } + + ASSERT(seg != NULL); + desc->base = seg->base; + desc->limit = seg->limit; + return (0); +} + +static int +svm_setcap(void *arg, int vcpu, int type, int val) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vlapic * +svm_vlapic_init(void *arg, int vcpuid) +{ + struct svm_softc *svm_sc; + struct vlapic *vlapic; + + svm_sc = arg; + vlapic = malloc(sizeof (struct vlapic), M_SVM_VLAPIC, + M_WAITOK | M_ZERO); + vlapic->vm = svm_sc->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + vlapic_cleanup(vlapic); + free(vlapic, M_SVM_VLAPIC); +} + +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + 
if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} + +struct vmm_ops vmm_ops_amd = { + .init = svm_init, + .cleanup = svm_cleanup, + .resume = svm_restore, + + .vminit = svm_vminit, + .vmrun = svm_vmrun, + .vmcleanup = svm_vmcleanup, + .vmgetreg = svm_getreg, + .vmsetreg = svm_setreg, + .vmgetdesc = svm_getdesc, + .vmsetdesc = svm_setdesc, + .vmgetcap = svm_getcap, + .vmsetcap = svm_setcap, + .vlapic_init = svm_vlapic_init, + .vlapic_cleanup = svm_vlapic_cleanup, + + .vmsavectx = svm_savectx, + .vmrestorectx = svm_restorectx, +}; diff --git a/usr/src/uts/intel/io/vmm/amd/svm.h b/usr/src/uts/intel/io/vmm/amd/svm.h new file mode 100644 index 0000000000..91e8419789 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm.h @@ -0,0 +1,70 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + uint64_t sctx_rbp; + uint64_t sctx_rbx; + uint64_t sctx_rcx; + uint64_t sctx_rdx; + uint64_t sctx_rdi; + uint64_t sctx_rsi; + uint64_t sctx_r8; + uint64_t sctx_r9; + uint64_t sctx_r10; + uint64_t sctx_r11; + uint64_t sctx_r12; + uint64_t sctx_r13; + uint64_t sctx_r14; + uint64_t sctx_r15; + uint64_t sctx_dr0; + uint64_t sctx_dr1; + uint64_t sctx_dr2; + uint64_t sctx_dr3; + uint64_t sctx_cr0_shadow; + + uint64_t host_dr0; + uint64_t host_dr1; + uint64_t host_dr2; + uint64_t host_dr3; + uint64_t host_dr6; + uint64_t host_dr7; + uint64_t host_debugctl; +}; + +struct cpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *pcpu); + +#endif /* _SVM_H_ */ diff --git a/usr/src/uts/intel/io/vmm/amd/svm_msr.c b/usr/src/uts/intel/io/vmm/amd/svm_msr.c new file mode 100644 index 0000000000..4fa7826fbf --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm_msr.c @@ -0,0 +1,179 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/x86_archext.h> +#include <sys/privregs.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <sys/vmm_kernel.h> + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} + +void +svm_msr_guest_init(struct svm_softc *sc, int vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ +} + +void +svm_msr_guest_enter(struct svm_softc *sc, int vcpu) +{ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} + +void +svm_msr_guest_exit(struct svm_softc *sc, int vcpu) +{ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* + * Save guest MSRs (if any) and restore host MSRs. 
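+ *
+ * As noted in svm_msr_guest_init(), no guest MSRs are handled by hand,
+ * so this reduces to restoring the host syscall-related MSRs.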
+ */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +vm_msr_result_t +svm_rdmsr(struct svm_softc *sc, int vcpu, uint32_t num, uint64_t *result) +{ + switch (num) { + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + case MSR_AMD_DE_CFG: + *result = 0; + /* + * Bit 1 of DE_CFG is defined by AMD to control whether the + * lfence instruction is serializing. Practically all CPUs + * supported by bhyve also contain this MSR, making it safe to + * expose unconditionally. + */ + if (is_x86_feature(x86_featureset, X86FSET_LFENCE_SER)) { + *result |= AMD_DE_CFG_LFENCE_DISPATCH; + } + break; + default: + return (VMR_UNHANLDED); + } + return (VMR_OK); +} + +vm_msr_result_t +svm_wrmsr(struct svm_softc *sc, int vcpu, uint32_t num, uint64_t val) +{ + switch (num) { + case MSR_SYSCFG: + /* Ignore writes */ + break; + case MSR_AMD_DE_CFG: + /* Ignore writes */ + break; + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. + */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register. + */ + break; + case MSR_EXTFEATURES: + break; + default: + return (VMR_UNHANLDED); + } + + return (VMR_OK); +} diff --git a/usr/src/uts/intel/io/vmm/amd/svm_msr.h b/usr/src/uts/intel/io/vmm/amd/svm_msr.h new file mode 100644 index 0000000000..8f0d14e6b9 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm_msr.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +vm_msr_result_t svm_wrmsr(struct svm_softc *, int, uint32_t, uint64_t); +vm_msr_result_t svm_rdmsr(struct svm_softc *, int, uint32_t, uint64_t *); + +#endif /* _SVM_MSR_H_ */ diff --git a/usr/src/uts/intel/io/vmm/amd/svm_softc.h b/usr/src/uts/intel/io/vmm/amd/svm_softc.h new file mode 100644 index 0000000000..adf9bb8ddd --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm_softc.h @@ -0,0 +1,117 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +#include <sys/hma.h> + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 + +/* + * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space + * due to VMCB alignment requirements. + */ +struct svm_vcpu { + struct vmcb vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + uint64_t nptgen; /* page table gen when the vcpu last ran */ + hma_svm_asid_t hma_asid; + boolean_t loaded; +} __aligned(PAGE_SIZE); + +/* + * SVM softc, one per virtual machine. 
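+ *
+ * The per-vcpu APIC pages and page-aligned svm_vcpu structures are placed
+ * first; the CTASSERT below checks that the fields which follow (starting
+ * with nptp) still begin on a page boundary.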
+ */ +struct svm_softc { + uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; + struct svm_vcpu vcpu[VM_MAXCPU]; + uint64_t nptp; /* nested page table (host PA) */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +}; + +CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); + +static __inline struct svm_vcpu * +svm_get_vcpu(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu])); +} + +static __inline struct vmcb * +svm_get_vmcb(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb)); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.state)); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.ctrl)); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].swctx)); +} + +static __inline void +svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) +{ + struct svm_vcpu *vcpustate; + + vcpustate = svm_get_vcpu(sc, vcpu); + vcpustate->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/usr/src/uts/intel/io/vmm/amd/svm_support.s b/usr/src/uts/intel/io/vmm/amd/svm_support.s new file mode 100644 index 0000000000..278dd5c5cb --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/svm_support.s @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/asm_linkage.h> + +#include "svm_assym.h" + +/* Porting note: This is named 'svm_support.S' upstream. */ + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. 
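+ *
+ * The callee-saved registers do not need the same treatment: they are
+ * reloaded from the host stack before this macro is invoked in svm_launch.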
+ */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* Save the physical address of the VMCB in %rax */ + movq %rdi, %rax + + /* Restore guest state. */ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + vmload %rax + vmrun %rax + vmsave %rax + + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx + wrmsr + + /* + * While SVM will save/restore the GDTR and IDTR, the TR does not enjoy + * such treatment. Reload the KTSS immediately, since it is used by + * dtrace and other fault/trap handlers. 
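+ *
+ * The descriptor's busy bit must be cleared before the ltr, since loading
+ * a TSS that is already marked busy raises #GP.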
+ */ + movq SVMSTK_RDX(%rsp), %rdi /* %rdi = CPU */ + movq CPU_GDT(%rdi), %rdi /* %rdi = cpu->cpu_gdt */ + leaq GDT_KTSS_OFF(%rdi), %rdi /* %rdi = &cpu_gdt[GDT_KTSS] */ + andb $0xfd, SSD_TYPE(%rdi) /* ssd_type.busy = 0 */ + movw $KTSS_SEL, %ax /* reload kernel TSS */ + ltr %ax + + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(svm_launch) diff --git a/usr/src/uts/intel/io/vmm/amd/vmcb.c b/usr/src/uts/intel/io/vmm/amd/vmcb.c new file mode 100644 index 0000000000..5be5240129 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/vmcb.c @@ -0,0 +1,150 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vmcb.h" +#include "svm.h" + +struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + return (&state->cs); + case VM_REG_GUEST_DS: + return (&state->ds); + case VM_REG_GUEST_ES: + return (&state->es); + case VM_REG_GUEST_FS: + return (&state->fs); + case VM_REG_GUEST_GS: + return (&state->gs); + case VM_REG_GUEST_SS: + return (&state->ss); + case VM_REG_GUEST_GDTR: + return (&state->gdt); + case VM_REG_GUEST_IDTR: + return (&state->idt); + case VM_REG_GUEST_LDTR: + return (&state->ldt); + case VM_REG_GUEST_TR: + return (&state->tr); + default: + panic("unexpected seg %d", type); + } +} + +uint64_t * +vmcb_regptr(struct vmcb *vmcb, int ident, uint32_t *dirtyp) +{ + struct vmcb_state *state; + uint64_t *res = NULL; + uint32_t dirty = VMCB_CACHE_NONE; + + state = &vmcb->state; + + switch (ident) { + case VM_REG_GUEST_CR2: + res = &state->cr2; + dirty = VMCB_CACHE_CR2; + break; + + case VM_REG_GUEST_CR3: + res = &state->cr3; + dirty = VMCB_CACHE_CR; + break; + + case VM_REG_GUEST_CR4: + res = &state->cr4; + dirty = VMCB_CACHE_CR; + break; + + case VM_REG_GUEST_DR6: + res = &state->dr6; + dirty = VMCB_CACHE_DR; + break; + + case VM_REG_GUEST_DR7: + res = &state->dr7; + dirty = VMCB_CACHE_DR; + break; + + case VM_REG_GUEST_EFER: + res = &state->efer; + dirty = VMCB_CACHE_CR; + break; + + case VM_REG_GUEST_RAX: + res = &state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + res = &state->rflags; + break; + + case VM_REG_GUEST_RIP: + res = &state->rip; + break; + + case VM_REG_GUEST_RSP: + res = &state->rsp; + break; + + default: + panic("unexpected register %d", ident); + break; + } + + ASSERT(res != NULL); + if (dirtyp != NULL) { + *dirtyp |= dirty; + } + return (res); +} diff --git a/usr/src/uts/intel/io/vmm/amd/vmcb.h b/usr/src/uts/intel/io/vmm/amd/vmcb.h new file mode 100644 index 0000000000..15b076b5bb --- /dev/null +++ b/usr/src/uts/intel/io/vmm/amd/vmcb.h @@ -0,0 +1,401 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +struct svm_softc; + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVLPG BIT(25) +#define VMCB_INTCPT_INVLPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, 
Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI (2 << 8) +#define VMCB_EVENTINJ_TYPE_EXCEPTION (3 << 8) +#define VMCB_EVENTINJ_TYPE_INTn (4 << 8) + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_CR0_READ 0x00 +#define VMCB_EXIT_CR15_READ 0x0f +#define VMCB_EXIT_CR0_WRITE 0x10 +#define VMCB_EXIT_CR15_WRITE 0x1f +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 +#define VMCB_EXIT_NMI 0x61 +#define VMCB_EXIT_SMI 0x62 +#define VMCB_EXIT_INIT 0x63 +#define VMCB_EXIT_VINTR 0x64 +#define VMCB_EXIT_CR0_SEL_WRITE 0x65 +#define VMCB_EXIT_PUSHF 0x70 +#define VMCB_EXIT_POPF 0x71 +#define VMCB_EXIT_CPUID 0x72 +#define VMCB_EXIT_IRET 0x74 +#define VMCB_EXIT_INVD 0x76 +#define VMCB_EXIT_PAUSE 0x77 +#define VMCB_EXIT_HLT 0x78 +#define VMCB_EXIT_INVLPG 0x79 +#define VMCB_EXIT_INVLPGA 0x7A +#define VMCB_EXIT_IO 0x7B +#define VMCB_EXIT_MSR 0x7C +#define VMCB_EXIT_SHUTDOWN 0x7F +#define VMCB_EXIT_VMRUN 0x80 +#define VMCB_EXIT_VMMCALL 0x81 +#define VMCB_EXIT_VMLOAD 0x82 +#define VMCB_EXIT_VMSAVE 0x83 +#define VMCB_EXIT_STGI 0x84 +#define VMCB_EXIT_CLGI 0x85 +#define VMCB_EXIT_SKINIT 0x86 +#define VMCB_EXIT_MONITOR 0x8A +#define VMCB_EXIT_MWAIT 0x8B +#define VMCB_EXIT_NPF 0x400 +#define VMCB_EXIT_INVALID -1 + +/* + * Move to/from CRx + * Bit definitions to decode EXITINFO1 + */ +#define VMCB_CRx_INFO1_GPR(x) ((x) & 0xf) +#define VMCB_CRx_INFO1_VALID(x) ((x) & (1UL << 63)) + +/* + * Nested page fault. + * Bit definitions to decode EXITINFO1. + */ +#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ +#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ +#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ +#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ +#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ + +#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ +#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ + +/* + * EXITINTINFO, Interrupt exit info for all intrecepts. + * Section 15.7.2, Intercepts during IDT Interrupt Delivery. + */ +#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) +#define VMCB_EXITINTINFO_TYPE(x) ((x) & (0x7 << 8)) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) != 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) != 0) +#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) + +/* Offset of various VMCB fields. 
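As an illustration of the VMCB_EXITINTINFO_* accessors defined just above, a hedged sketch of how a #VMEXIT handler might decode a pending-event record ('ctrl' is an assumed struct vmcb_ctrl pointer; the real handler lives in svm.c, outside this excerpt):

uint64_t intinfo = ctrl->exitintinfo;

if (VMCB_EXITINTINFO_VALID(intinfo)) {
	uint8_t vector = VMCB_EXITINTINFO_VECTOR(intinfo);
	uint64_t type = VMCB_EXITINTINFO_TYPE(intinfo);
	uint32_t errcode = VMCB_EXITINTINFO_EC_VALID(intinfo) ?
	    VMCB_EXITINTINFO_EC(intinfo) : 0;

	/* vector/type/errcode describe an event that was mid-delivery */
	/* when the #VMEXIT occurred and must be re-queued for injection. */
}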
*/ +#define VMCB_OFF_CTRL(x) (x) +#define VMCB_OFF_STATE(x) ((x) + 0x400) + +#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) +#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) +#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) +#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) +#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) +#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) +#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) +#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) +#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) +#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) +#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) +#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) +#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) +#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) +#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) +#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) +#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) +#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) +#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) +#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) +#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) +#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) +#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) +#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) + +#ifdef _KERNEL +/* VMCB save state area segment format */ +struct vmcb_segment { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +}; +CTASSERT(sizeof (struct vmcb_segment) == 16); + +/* Convert to/from vmcb segment access to generic (VMX) access */ +#define VMCB_ATTR2ACCESS(attr) ((((attr) & 0xf00) << 4) | ((attr) & 0xff)) +#define VMCB_ACCESS2ATTR(acc) ((((acc) & 0xf000) >> 4) | ((acc) & 0xff)) + +/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ +#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ +#define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. */ + +/* Fields for Virtual Interrupt Control (v_irq) */ +#define V_IRQ BIT(0) /* Offset 0x60 bit 8 (0x61 bit 0) */ +#define V_VGIF_VALUE BIT(1) /* Offset 0x60 bit 9 (0x61 bit 1) */ + +/* Fields for Virtual Interrupt Control (v_intr_prio) */ +#define V_INTR_PRIO 0xf /* Offset 0x60 bits 16-19 (0x62 bits 0-3) */ +#define V_IGN_TPR BIT(4) /* Offset 0x60 bit 20 (0x62 bit 4) */ + +/* Fields for Virtual Interrupt Control (v_intr_ctrl) */ +#define V_INTR_MASKING BIT(0) /* Offset 0x60 bit 24 (0x63 bit 0) */ +#define V_VGIF_ENABLE BIT(1) /* Offset 0x60 bit 25 (0x63 bit 1) */ +#define V_AVIC_ENABLE BIT(7) /* Offset 0x60 bit 31 (0x63 bit 7) */ + +/* Fields in Interrupt Shadow, offset 0x68 */ +#define VIRTUAL_INTR_SHADOW BIT(0) +#define GUEST_INTERRUPT_MASK BIT(1) + +/* Fields in Nested Paging, offset 0x90 */ +#define NP_ENABLE BIT(0) /* Enable nested paging */ +#define SEV_ENABLE BIT(1) /* Enable SEV */ +#define SEV_ES_ENABLE BIT(2) /* Enable SEV-ES */ +#define GUEST_MODE_EXEC_TRAP BIT(3) /* Guest mode execute trap */ +#define VIRT_TRANSPAR_ENCRYPT BIT(5) /* Virtual transparent encryption */ + +/* Fields in Misc virt controls, offset 0xB8 */ +#define LBR_VIRT_ENABLE BIT(0) /* Enable LBR virtualization accel */ +#define VIRT_VMSAVE_VMLOAD BIT(1) /* Virtualized VMSAVE/VMLOAD */ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* 0x00-0x13: all intercepts */ + uint32_t _pad1[10]; /* 0x14-0x3B: Reserved. 
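A worked example of the VMCB_OFF_* helpers above: the guest PAT sits 0x268 bytes into the state save area, and the state area itself begins 0x400 bytes into the VMCB (the split that the CTASSERTs further down pin in place), so:

	VMCB_OFF_GUEST_PAT == VMCB_OFF_STATE(0x268) == 0x400 + 0x268 == 0x668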
*/ + uint32_t pause_ctrl; /* 0x3C, PAUSE filter thresh/count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t _pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: Virtual TPR */ + uint8_t v_irq; /* 0x61: V_IRQ, V_GIF_VALUE + Reserved */ + uint8_t v_intr_prio; /* 0x62: V_INTR_PRIO, V_IGN_TPR */ + uint8_t v_intr_ctrl; /* 0x63: V_INTR_MASKING, vGIF and AVIC enable */ + uint8_t v_intr_vector; /* 0x64: Virtual interrupt vector */ + uint8_t _pad3[3]; /* 0x65-0x67: Reserved */ + uint64_t intr_shadow; /* 0x68: Interrupt shadow (and more) */ + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_ctrl; /* 0x90, Nested paging control. */ + uint64_t _pad4[2]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* 0xB0, Nested page table. */ + uint64_t misc_ctrl; /* 0xB8, Misc virt controls */ + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t _pad5; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. */ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; /* 0xD1-0xDF: guest instr bytes */ + uint64_t avic_page_pa; /* 0xEO: AVIC backing page */ + uint64_t _pad6; /* 0xE8-0xEF: Reserved */ + uint64_t avic_log_tbl; /* 0xFO: AVIC logical table */ + uint64_t avic_phys_tbl; /* 0xF8: AVIC physical page */ + uint64_t _pad7; /* 0x100-0x107: Reserved */ + uint64_t vmsa_pa; /* 0x108: VMSA pointer */ + uint64_t _pad8[94]; /* 0x110-0x3FF: Reserved */ +}; +CTASSERT(sizeof (struct vmcb_ctrl) == 1024); +CTASSERT(offsetof(struct vmcb_ctrl, vmsa_pa) == 0x108); + +struct vmcb_state { + struct vmcb_segment es; /* 0x00: 32bit base */ + struct vmcb_segment cs; /* 0x10: 32bit base */ + struct vmcb_segment ss; /* 0x20: 32bit base */ + struct vmcb_segment ds; /* 0x30: 32bit base */ + struct vmcb_segment fs; /* 0x40 */ + struct vmcb_segment gs; /* 0x50 */ + struct vmcb_segment gdt; /* 0x60: base + 16bit limit */ + struct vmcb_segment ldt; /* 0x70 */ + struct vmcb_segment idt; /* 0x80: base + 16bit limit */ + struct vmcb_segment tr; /* 0x90 */ + uint8_t _pad1[43]; /* 0xA0-0xCA: Reserved */ + uint8_t cpl; /* 0xCB: CPL (real mode: 0, virt: 3) */ + uint32_t _pad2; /* 0xCC-0xCF: Reserved */ + uint64_t efer; /* 0xD0 */ + uint64_t _pad3[14]; /* 0xD8-0x147: Reserved */ + uint64_t cr4; /* 0x148 */ + uint64_t cr3; /* 0x150 */ + uint64_t cr0; /* 0x158 */ + uint64_t dr7; /* 0x160 */ + uint64_t dr6; /* 0x168 */ + uint64_t rflags; /* 0x170 */ + uint64_t rip; /* 0x178 */ + uint64_t _pad4[11]; /* 0x180-0x1D7: Reserved */ + uint64_t rsp; /* 0x1D8 */ + uint64_t _pad5[3]; /* 0x1E0-0x1F7: Reserved */ + uint64_t rax; /* 0x1F8 */ + uint64_t star; /* 0x200 */ + uint64_t lstar; /* 0x208 */ + uint64_t cstar; /* 0x210 */ + uint64_t sfmask; /* 0x218 */ + uint64_t kernelgsbase; /* 0x220 */ + uint64_t sysenter_cs; /* 0x228 */ + uint64_t sysenter_esp; /* 0x230 */ + uint64_t sysenter_eip; /* 0x238 */ + uint64_t cr2; /* 0x240 */ + uint64_t _pad6[4]; /* 0x248-0x267: Reserved */ + uint64_t g_pat; /* 0x268 */ + uint64_t dbgctl; /* 0x270 */ + uint64_t br_from; /* 0x278 */ + uint64_t br_to; /* 0x280 */ + uint64_t int_from; /* 0x288 */ + uint64_t int_to; /* 0x290 */ + uint64_t _pad7[301]; /* Reserved up to end of VMCB 
*/ +}; +CTASSERT(sizeof (struct vmcb_state) == 0xC00); +CTASSERT(offsetof(struct vmcb_state, int_to) == 0x290); + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. + * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. general purpose registers) + */ +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +}; +CTASSERT(sizeof (struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +struct vmcb_segment *vmcb_segptr(struct vmcb *vmcb, int type); +uint64_t *vmcb_regptr(struct vmcb *vmcb, int ident, uint32_t *dirtyp); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/usr/src/uts/intel/io/vmm/intel/offsets.in b/usr/src/uts/intel/io/vmm/intel/offsets.in new file mode 100644 index 0000000000..f467e7b1ca --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/offsets.in @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> + +#include <machine/vmm.h> +#include <sys/vmm_vm.h> + +#include "intel/vmx.h" + +vmxctx + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + inst_fail_status VMXCTX_INST_FAIL_STATUS + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_GUEST_VMEXIT 0 +\#define VMX_VMRESUME_ERROR 1 +\#define VMX_VMLAUNCH_ERROR 2 +\#define VMX_INVEPT_ERROR 3 +\#define VMX_VMWRITE_ERROR 4 diff --git a/usr/src/uts/intel/io/vmm/intel/vmcs.c b/usr/src/uts/intel/io/vmm/intel/vmcs.c new file mode 100644 index 0000000000..7fabba79f7 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmcs.c @@ -0,0 +1,271 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
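The offsets.in file added just above is input to the build's offset-generation step (genassym/ctfstabs); each listed struct member becomes an assembler-visible byte-offset constant consumed by the VMX entry/exit assembly. A sketch of the assumed equivalence (the generated header is a build artifact, not part of this diff):

CTASSERT(VMXCTX_GUEST_RDI == offsetof(struct vmxctx, guest_rdi));
CTASSERT(VMXCTX_INST_FAIL_STATUS ==
    offsetof(struct vmxctx, inst_fail_status));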
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include "vmx.h" + +/* Bits 0-30 of VMX_BASIC MSR contain VMCS revision identifier */ +#define VMX_BASIC_REVISION(v) ((v) & 0x7fffffff) + +uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); + case VM_REG_GUEST_ENTRY_INST_LENGTH: + return (VMCS_ENTRY_INST_LENGTH); + default: + return (VMCS_INVALID_ENCODING); + } +} + +void +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = 
VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + panic("invalid segment register %d", seg); + } +} + +void +vmcs_clear(uintptr_t vmcs_pa) +{ + int err; + + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (err) + : [addr] "m" (vmcs_pa) + : "memory"); + + if (err != 0) { + panic("vmclear(%p) error %d", (void *)vmcs_pa, err); + } + + /* + * A call to critical_enter() was made in vmcs_load() to prevent + * preemption. Now that the VMCS is unloaded, it is safe to relax that + * restriction. + */ + critical_exit(); +} + +void +vmcs_initialize(struct vmcs *vmcs, uintptr_t vmcs_pa) +{ + int err; + + /* set to VMCS revision */ + vmcs->identifier = VMX_BASIC_REVISION(rdmsr(MSR_VMX_BASIC)); + + /* + * Perform a vmclear on the VMCS, but without the critical section + * manipulation as done by vmcs_clear() above. + */ + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (err) + : [addr] "m" (vmcs_pa) + : "memory"); + + if (err != 0) { + panic("vmclear(%p) error %d", (void *)vmcs_pa, err); + } +} + +void +vmcs_load(uintptr_t vmcs_pa) +{ + int err; + + /* + * While the VMCS is loaded on the CPU for subsequent operations, it is + * important that the thread not be preempted. That is ensured with + * critical_enter() here, with a matching critical_exit() call in + * vmcs_clear() once the VMCS is unloaded. + */ + critical_enter(); + + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (err) + : [addr] "m" (vmcs_pa) + : "memory"); + + if (err != 0) { + panic("vmptrld(%p) error %d", (void *)vmcs_pa, err); + } +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + __asm __volatile("vmread %[enc], %[val];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (error), [val] "=r" (val) + : [enc] "r" ((uint64_t)encoding) + : "memory"); + + if (error != 0) { + panic("vmread(%x) error %d", encoding, error); + } + + return (val); +} + +void +vmcs_write(uint32_t encoding, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %[val], %[enc];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (error) + : [val] "r" (val), [enc] "r" ((uint64_t)encoding) + : "memory"); + + if (error != 0) { + panic("vmwrite(%x, %lx) error %d", encoding, val, error); + } +} diff --git a/usr/src/uts/intel/io/vmm/intel/vmcs.h b/usr/src/uts/intel/io/vmm/intel/vmcs.h new file mode 100644 index 0000000000..d61244baee --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmcs.h @@ -0,0 +1,387 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
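Taken together, the VMCS helpers above imply a strict bracketing discipline: vmcs_load() makes the VMCS current and enters a critical section, every vmcs_read()/vmcs_write() must happen while it is current, and vmcs_clear() flushes it back to memory and exits the critical section. A minimal sketch of the assumed calling pattern ('vmcs_pa' is a hypothetical variable; the real callers appear in vmx.c later in this patch):

vmcs_load(vmcs_pa);			/* VMPTRLD + critical_enter() */

uint64_t rip = vmcs_read(VMCS_GUEST_RIP);
vmcs_write(VMCS_GUEST_RIP, rip);	/* any number of accesses here */

vmcs_clear(vmcs_pa);			/* VMCLEAR + critical_exit() */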
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2017 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +#ifndef _ASM +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof (uint32_t) * 2]; +}; +CTASSERT(sizeof (struct vmcs) == PAGE_SIZE); + +uint32_t vmcs_field_encoding(int ident); +void vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, + uint32_t *acc); + +void vmcs_initialize(struct vmcs *vmcs, uintptr_t vmcs_pa); + +void vmcs_load(uintptr_t vmcs_pa); +void vmcs_clear(uintptr_t vmcs_pa); + +uint64_t vmcs_read(uint32_t encoding); +void vmcs_write(uint32_t encoding, uint64_t val); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _ASM */ +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
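Before the raw field encodings, a short sketch of how the accessor macros above tend to be used by an exit handler (illustrative only; the actual handlers are in vmx.c further down in this patch and use the EXIT_REASON_* values defined below):

uint32_t reason = vmcs_exit_reason();	/* low 16 bits of VMCS_EXIT_REASON */

if (reason == EXIT_REASON_CPUID) {
	/* Skip past the emulated instruction before resuming the guest. */
	vmcs_write(VMCS_GUEST_RIP,
	    vmcs_guest_rip() + vmexit_instruction_length());
}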
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define 
VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define 
EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/usr/src/uts/intel/io/vmm/intel/vmx.c b/usr/src/uts/intel/io/vmm/intel/vmx.c new file mode 100644 index 0000000000..a44c90dcbe --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx.c @@ -0,0 +1,3772 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. 
+ * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#include <sys/x86_archext.h> +#include <sys/smp_impldefs.h> +#include <sys/smt.h> +#include <sys/hma.h> +#include <sys/trap.h> +#include <sys/archsystm.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/reg.h> +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/vmparam.h> +#include <sys/vmm_vm.h> +#include <sys/vmm_kernel.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <sys/vmm_instruction_emul.h> +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "vmcs.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +/* We consider TSC offset a necessity for unsynched TSC handling */ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_TSC_OFFSET | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) + +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +/* + * EPT and Unrestricted Guest are considered necessities. The latter is not a + * requirement on FreeBSD, where grub2-bhyve is used to load guests directly + * without a bootrom starting in real mode. 
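The PROCBASED_CTLS_WINDOW_SETTING bits defined above are deliberately excluded from the static control set and are only switched on when an interrupt or NMI is pending but cannot yet be injected. A hedged sketch of that runtime toggle (the real helpers appear later in vmx.c; 'vcpu' is assumed and that vCPU's VMCS must be current):

vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
/* The guest now exits as soon as it can accept the pending interrupt. */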
+ */ +#define PROCBASED_CTLS2_ONE_SETTING \ + (PROCBASED2_ENABLE_EPT | \ + PROCBASED2_UNRESTRICTED_GUEST) +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_ACKNOWLEDGE_INTERRUPT) + +#define VM_EXIT_CTLS_ZERO_SETTING 0 + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) + +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +/* + * Cover the EPT capabilities used by bhyve at present: + * - 4-level page walks + * - write-back memory type + * - INVEPT operations (all types) + * - INVVPID operations (single-context only) + */ +#define EPT_CAPS_REQUIRED \ + (IA32_VMX_EPT_VPID_PWL4 | \ + IA32_VMX_EPT_VPID_TYPE_WB | \ + IA32_VMX_EPT_VPID_INVEPT | \ + IA32_VMX_EPT_VPID_INVEPT_SINGLE | \ + IA32_VMX_EPT_VPID_INVEPT_ALL | \ + IA32_VMX_EPT_VPID_INVVPID | \ + IA32_VMX_EPT_VPID_INVVPID_SINGLE) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; + +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static int vmx_initialized; + +/* Do not flush RSB upon vmexit */ +static int no_flush_rsb; + +/* + * Optional capabilities + */ + +/* HLT triggers a VM-exit */ +static int cap_halt_exit; + +/* PAUSE triggers a VM-exit */ +static int cap_pause_exit; + +/* Monitor trap flag */ +static int cap_monitor_trap; + +/* Guests are allowed to use INVPCID */ +static int cap_invpcid; + +/* Extra capabilities (VMX_CAP_*) beyond the minimum */ +static enum vmx_caps vmx_capabilities; + +/* APICv posted interrupt vector */ +static int pirvec = -1; + +static uint_t vpid_alloc_failed; + +int guest_l1d_flush; +int guest_l1d_flush_sw; + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; +}; + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. 
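Stepping back to the *_ONE_SETTING/*_ZERO_SETTING pairs defined above: they are validated by vmx_set_ctlreg() (in vmx_msr.c, not part of this excerpt) against the VMX capability MSRs. A sketch of the check it is assumed to perform, using the "true" capability MSR layout where the low 32 bits are the control bits forced to 1 and the high 32 bits are the bits allowed to be 1:

static int
example_ctls_supported(uint64_t cap_msr, uint32_t ones, uint32_t zeros)
{
	uint32_t forced_one = (uint32_t)cap_msr;
	uint32_t allowed_one = (uint32_t)(cap_msr >> 32);

	if ((ones & ~allowed_one) != 0)
		return (0);	/* a required-1 bit may never be set */
	if ((zeros & forced_one) != 0)
		return (0);	/* a required-0 bit is forced on */
	return (1);
}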
+ */ + +/* BEGIN CSTYLED */ +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); +/* END CSTYLED */ + +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); +static void vmx_apply_tsc_adjust(struct vmx *, int); +static void vmx_apicv_sync_tmr(struct vlapic *vlapic); +static void vmx_tpr_shadow_enter(struct vlapic *vlapic); +static void vmx_tpr_shadow_exit(struct vlapic *vlapic); + +static void +vmx_allow_x2apic_msrs(struct vmx *vmx, int vcpuid) +{ + /* + * Allow readonly access to the following x2APIC MSRs from the guest. 
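For reference on the MSR loops that follow (architectural background, not something this hunk defines): x2APIC mode exposes each local APIC register at MSR 0x800 + (MMIO offset >> 4), so the eight-wide ISR/TMR/IRR register banks occupy contiguous MSR ranges and can be walked with a simple index:

/* Assumed mapping, e.g. ISR0 (MMIO 0x100) -> MSR 0x810, ISR7 -> 0x817. */
#define	EXAMPLE_X2APIC_MSR(off)		(0x800 + ((off) >> 4))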
+ */ + guest_msr_ro(vmx, vcpuid, MSR_APIC_ID); + guest_msr_ro(vmx, vcpuid, MSR_APIC_VERSION); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LDR); + guest_msr_ro(vmx, vcpuid, MSR_APIC_SVR); + + for (uint_t i = 0; i < 8; i++) { + guest_msr_ro(vmx, vcpuid, MSR_APIC_ISR0 + i); + guest_msr_ro(vmx, vcpuid, MSR_APIC_TMR0 + i); + guest_msr_ro(vmx, vcpuid, MSR_APIC_IRR0 + i); + } + + guest_msr_ro(vmx, vcpuid, MSR_APIC_ESR); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_TIMER); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_THERMAL); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_PCINT); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_LINT0); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_LINT1); + guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_ERROR); + guest_msr_ro(vmx, vcpuid, MSR_APIC_ICR_TIMER); + guest_msr_ro(vmx, vcpuid, MSR_APIC_DCR_TIMER); + guest_msr_ro(vmx, vcpuid, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. + * + * These registers get special treatment described in the section + * "Virtualizing MSR-Based APIC Accesses". + */ + guest_msr_rw(vmx, vcpuid, MSR_APIC_TPR); + guest_msr_rw(vmx, vcpuid, MSR_APIC_EOI); + guest_msr_rw(vmx, vcpuid, MSR_APIC_SELF_IPI); +} + +static ulong_t +vmx_fix_cr0(ulong_t cr0) +{ + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +/* + * Given a live (VMCS-active) cr0 value, and its shadow counterpart, calculate + * the value observable from the guest. + */ +static ulong_t +vmx_unshadow_cr0(uint64_t cr0, uint64_t shadow) +{ + return ((cr0 & ~cr0_ones_mask) | + (shadow & (cr0_zeros_mask | cr0_ones_mask))); +} + +static ulong_t +vmx_fix_cr4(ulong_t cr4) +{ + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +/* + * Given a live (VMCS-active) cr4 value, and its shadow counterpart, calculate + * the value observable from the guest. + */ +static ulong_t +vmx_unshadow_cr4(uint64_t cr4, uint64_t shadow) +{ + return ((cr4 & ~cr4_ones_mask) | + (shadow & (cr4_zeros_mask | cr4_ones_mask))); +} + +static void +vpid_free(int vpid) +{ + if (vpid < 0 || vpid > 0xffff) + panic("vpid_free: invalid vpid %d", vpid); + + /* + * VPIDs [0,VM_MAXCPU] are special and are not allocated from + * the unit number allocator. + */ + + if (vpid > VM_MAXCPU) + hma_vmx_vpid_free((uint16_t)vpid); +} + +static void +vpid_alloc(uint16_t *vpid, int num) +{ + int i, x; + + if (num <= 0 || num > VM_MAXCPU) + panic("invalid number of vpids requested: %d", num); + + /* + * If the "enable vpid" execution control is not enabled then the + * VPID is required to be 0 for all vcpus. + */ + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { + for (i = 0; i < num; i++) + vpid[i] = 0; + return; + } + + /* + * Allocate a unique VPID for each vcpu from the unit number allocator. + */ + for (i = 0; i < num; i++) { + uint16_t tmp; + + tmp = hma_vmx_vpid_alloc(); + x = (tmp == 0) ? -1 : tmp; + + if (x == -1) + break; + else + vpid[i] = x; + } + + if (i < num) { + atomic_add_int(&vpid_alloc_failed, 1); + + /* + * If the unit number allocator does not have enough unique + * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. + * + * These VPIDs are not be unique across VMs but this does not + * affect correctness because the combined mappings are also + * tagged with the EP4TA which is unique for each VM. + * + * It is still sub-optimal because the invvpid will invalidate + * combined mappings for a particular VPID across all EP4TAs. 
+ */ + while (i-- > 0) + vpid_free(vpid[i]); + + for (i = 0; i < num; i++) + vpid[i] = i + 1; + } +} + +static int +vmx_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static void +vmx_restore(void) +{ + /* No-op on illumos */ +} + +static int +vmx_init(void) +{ + int error; + uint64_t fixed0, fixed1; + uint32_t tmp; + enum vmx_caps avail_caps = VMX_CAP_NONE; + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, + 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check for APIC virtualization capabilities: + * - TPR shadowing + * - Full APICv (with or without x2APIC support) + * - Posted interrupt handling + */ + if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0) { + avail_caps |= VMX_CAP_TPR_SHADOW; + + const uint32_t apicv_bits = + PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY; + if 
(vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, apicv_bits, 0, &tmp) == 0) { + avail_caps |= VMX_CAP_APICV; + + /* + * It may make sense in the future to differentiate + * hardware (or software) configurations with APICv but + * no support for accelerating x2APIC mode. + */ + avail_caps |= VMX_CAP_APICV_X2APIC; + + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_POSTED_INTERRUPT, 0, &tmp); + if (error == 0) { + /* + * If the PSM-provided interfaces for requesting + * and using a PIR IPI vector are present, use + * them for posted interrupts. + */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + avail_caps |= VMX_CAP_APICV_PIR; + } + } + } + } + + /* + * Check for necessary EPT capabilities + * + * TODO: Properly handle when IA32_VMX_EPT_VPID_HW_AD is missing and the + * hypervisor intends to utilize dirty page tracking. + */ + uint64_t ept_caps = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((ept_caps & EPT_CAPS_REQUIRED) != EPT_CAPS_REQUIRED) { + cmn_err(CE_WARN, "!Inadequate EPT capabilities: %lx", ept_caps); + return (EINVAL); + } + +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. + */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by smt_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * Since Unrestricted Guest was already verified present, CR0_PE and + * CR0_PG are allowed to be set to zero in VMX non-root operation + */ + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. 
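To make the CR0 mask plumbing above concrete, a hedged sketch of how the masks are consumed when a guest CR0 value is installed and later read back (the actual set/get paths live elsewhere in vmx.c; 'guest_cr0' is a hypothetical value):

/* Install: the CPU runs with the fixed-up value, while the shadow keeps */
/* the value the guest believes it wrote. */
vmcs_write(VMCS_CR0_SHADOW, guest_cr0);
vmcs_write(VMCS_GUEST_CR0, vmx_fix_cr0(guest_cr0));

/* Read back the guest-visible value. */
uint64_t observed = vmx_unshadow_cr0(vmcs_read(VMCS_GUEST_CR0),
    vmcs_read(VMCS_CR0_SHADOW));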
+ */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + vmx_msr_init(); + + vmx_capabilities = avail_caps; + vmx_initialized = 1; + + return (0); +} + +static void +vmx_trigger_hostintr(int vector) +{ + VERIFY(vector >= 32 && vector <= 255); + vmx_call_isr(vector - 32); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid[VM_MAXCPU]; + int i, error, datasel; + struct vmx *vmx; + uint32_t exc_bitmap; + uint16_t maxcpus; + uint32_t proc_ctls, proc2_ctls, pin_ctls; + uint64_t apic_access_pa = UINT64_MAX; + + vmx = malloc(sizeof (struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + vmx->eptp = vmspace_table_root(vm_get_vmspace(vm)); + + /* + * Clean up EP4TA-tagged guest-physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + hma_vmx_invept_allcpus((uintptr_t)vmx->eptp); + + vmx_msr_bitmap_initialize(vmx); + + vpid_alloc(vpid, VM_MAXCPU); + + /* Grab the established defaults */ + proc_ctls = procbased_ctls; + proc2_ctls = procbased_ctls2; + pin_ctls = pinbased_ctls; + /* For now, default to the available capabilities */ + vmx->vmx_caps = vmx_capabilities; + + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { + proc_ctls |= PROCBASED_USE_TPR_SHADOW; + proc_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + proc_ctls &= ~PROCBASED_CR8_STORE_EXITING; + } + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + ASSERT(vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)); + + proc2_ctls |= (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + /* + * Allocate a page of memory to back the APIC access address for + * when APICv features are in use. Guest MMIO accesses should + * never actually reach this page, but rather be intercepted. + */ + vmx->apic_access_page = kmem_zalloc(PAGESIZE, KM_SLEEP); + VERIFY3U((uintptr_t)vmx->apic_access_page & PAGEOFFSET, ==, 0); + apic_access_pa = vtophys(vmx->apic_access_page); + + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + apic_access_pa); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { + ASSERT(vmx_cap_en(vmx, VMX_CAP_APICV)); + + pin_ctls |= PINBASED_POSTED_INTERRUPT; + } + + maxcpus = vm_get_maxcpus(vm); + datasel = vmm_get_host_datasel(); + for (i = 0; i < maxcpus; i++) { + /* + * Cache physical address lookups for various components which + * may be required inside the critical_enter() section implied + * by VMPTRLD() below. 
+ */ + vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap[i]); + vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); + vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); + + vmx->vmcs_pa[i] = (uintptr_t)vtophys(&vmx->vmcs[i]); + vmcs_initialize(&vmx->vmcs[i], vmx->vmcs_pa[i]); + + vmx_msr_guest_init(vmx, i); + + vmcs_load(vmx->vmcs_pa[i]); + + vmcs_write(VMCS_HOST_IA32_PAT, vmm_get_host_pat()); + vmcs_write(VMCS_HOST_IA32_EFER, vmm_get_host_efer()); + + /* Load the control registers */ + vmcs_write(VMCS_HOST_CR0, vmm_get_host_cr0()); + vmcs_write(VMCS_HOST_CR4, vmm_get_host_cr4() | CR4_VMXE); + + /* Load the segment selectors */ + vmcs_write(VMCS_HOST_CS_SELECTOR, vmm_get_host_codesel()); + + vmcs_write(VMCS_HOST_ES_SELECTOR, datasel); + vmcs_write(VMCS_HOST_SS_SELECTOR, datasel); + vmcs_write(VMCS_HOST_DS_SELECTOR, datasel); + + vmcs_write(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel()); + vmcs_write(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel()); + vmcs_write(VMCS_HOST_TR_SELECTOR, vmm_get_host_tsssel()); + + /* + * Configure host sysenter MSRs to be restored on VM exit. + * The thread-specific MSR_INTC_SEP_ESP value is loaded in + * vmx_run. + */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL); + vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, + rdmsr(MSR_SYSENTER_EIP_MSR)); + + /* instruction pointer */ + if (no_flush_rsb) { + vmcs_write(VMCS_HOST_RIP, (uint64_t)vmx_exit_guest); + } else { + vmcs_write(VMCS_HOST_RIP, + (uint64_t)vmx_exit_guest_flush_rsb); + } + + /* link pointer */ + vmcs_write(VMCS_LINK_POINTER, ~0); + + vmcs_write(VMCS_EPTP, vmx->eptp); + vmcs_write(VMCS_PIN_BASED_CTLS, pin_ctls); + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc2_ctls); + vmcs_write(VMCS_EXIT_CTLS, exit_ctls); + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + vmcs_write(VMCS_MSR_BITMAP, msr_bitmap_pa); + vmcs_write(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, + vtophys(&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } + + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + vmcs_write(VMCS_EXCEPTION_BITMAP, exc_bitmap); + + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + vmcs_write(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); + + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { + vmcs_write(VMCS_VIRTUAL_APIC, apic_page_pa); + } + + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + vmcs_write(VMCS_APIC_ACCESS, apic_access_pa); + vmcs_write(VMCS_EOI_EXIT0, 0); + vmcs_write(VMCS_EOI_EXIT1, 0); + vmcs_write(VMCS_EOI_EXIT2, 0); + vmcs_write(VMCS_EOI_EXIT3, 0); + } + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { + vmcs_write(VMCS_PIR_VECTOR, pirvec); + vmcs_write(VMCS_PIR_DESC, pir_desc_pa); + } + + /* + * Set up the CR0/4 masks and configure the read shadow state + * to the power-on register value from the Intel Sys Arch. 
+ * CR0 - 0x60000010 + * CR4 - 0 + */ + vmcs_write(VMCS_CR0_MASK, cr0_ones_mask | cr0_zeros_mask); + vmcs_write(VMCS_CR0_SHADOW, 0x60000010); + vmcs_write(VMCS_CR4_MASK, cr4_ones_mask | cr4_zeros_mask); + vmcs_write(VMCS_CR4_SHADOW, 0); + + vmcs_clear(vmx->vmcs_pa[i]); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = proc_ctls; + vmx->cap[i].proc_ctls2 = proc2_ctls; + vmx->cap[i].exc_bitmap = exc_bitmap; + + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) +{ + int handled; + + handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax, + (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, + (uint64_t *)&vmxctx->guest_rdx); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); +#endif +} + +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t _res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof (struct invvpid_desc) == 16); + +static __inline void +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + DTRACE_PROBE3(vmx__invvpid, uint64_t, type, uint16_t, desc.vpid, + uint64_t, desc.linear_addr); + + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) { + panic("invvpid error %d", error); + } +} + +/* + * Invalidate guest mappings identified by its VPID from the TLB. + * + * This is effectively a flush of the guest TLB, removing only "combined + * mappings" (to use the VMX parlance). Actions which modify the EPT structures + * for the instance (such as unmapping GPAs) would require an 'invept' flush. + */ +static void +vmx_invvpid(struct vmx *vmx, int vcpu, int running) +{ + struct vmxstate *vmxstate; + struct vmspace *vms; + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->vpid == 0) { + return; + } + + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } + + /* + * Invalidate all mappings tagged with 'vpid' + * + * This is done when a vCPU moves between host CPUs, where there may be + * stale TLB entries for this VPID on the target, or if emulated actions + * in the guest CPU have incurred an explicit TLB flush. + */ + vms = vm_get_vmspace(vmx->vm); + if (vmspace_table_gen(vms) == vmx->eptgen[curcpu]) { + struct invvpid_desc invvpid_desc = { + .vpid = vmxstate->vpid, + .linear_addr = 0, + ._res1 = 0, + ._res2 = 0, + }; + + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The INVVPID can be skipped if an INVEPT is going to be + * performed before entering the guest. 
The INVEPT will + * invalidate combined mappings for the EP4TA associated with + * this guest, in all VPIDs. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); + } +} + +static __inline void +invept(uint64_t type, uint64_t eptp) +{ + int error; + struct invept_desc { + uint64_t eptp; + uint64_t _resv; + } desc = { eptp, 0 }; + + DTRACE_PROBE2(vmx__invept, uint64_t, type, uint64_t, eptp); + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE_ASM + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error != 0) { + panic("invvpid error %d", error); + } +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + struct vmxstate *vmxstate; + + /* + * Regardless of whether the VM appears to have migrated between CPUs, + * save the host sysenter stack pointer. As it points to the kernel + * stack of each thread, the correct value must be maintained for every + * trip into the critical section. + */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); + + /* + * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or + * migration between host CPUs with differing TSC values. + */ + vmx_apply_tsc_adjust(vmx, vcpu); + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + /* Load the per-CPU IDT address */ + vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase()); + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static __inline void +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static __inline void +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static __inline bool +vmx_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + return ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0); +} + +static __inline void +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + if (!vmx_nmi_window_exiting(vmx, vcpu)) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + } +} + +static __inline void +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + ASSERT(vmx_nmi_window_exiting(vmx, vcpu)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); +} + +/* + * Set the TSC adjustment, taking into account the offsets measured between + * host physical CPUs. 
This is required even if the guest has not set a TSC + * offset since vCPUs inherit the TSC offset of whatever physical CPU it has + * migrated onto. Without this mitigation, un-synched host TSCs will convey + * the appearance of TSC time-travel to the guest as its vCPUs migrate. + */ +static void +vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) +{ + const uint64_t offset = vcpu_tsc_offset(vmx->vm, vcpu, true); + + ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET); + + if (vmx->tsc_offset_active[vcpu] != offset) { + vmcs_write(VMCS_TSC_OFFSET, offset); + vmx->tsc_offset_active[vcpu] = offset; + } +} + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + ASSERT0(vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & NMI_BLOCKING); + ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID); + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + vmcs_write(VMCS_ENTRY_INTR_INFO, + IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); +} + +/* + * Inject exceptions, NMIs, and ExtINTs. + * + * The logic behind these are complicated and may involve mutex contention, so + * the injection is performed without the protection of host CPU interrupts + * being disabled. This means a racing notification could be "lost", + * necessitating a later call to vmx_inject_recheck() to close that window + * of opportunity. + */ +static enum event_inject_state +vmx_inject_events(struct vmx *vmx, int vcpu, uint64_t rip) +{ + uint64_t entryinfo; + uint32_t gi, info; + int vector; + enum event_inject_state state; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + state = EIS_CAN_INJECT; + + /* Clear any interrupt blocking if the guest %rip has changed */ + if (vmx->state[vcpu].nextrip != rip && (gi & HWINTR_BLOCKING) != 0) { + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + + /* + * It could be that an interrupt is already pending for injection from + * the VMCS. This would be the case if the vCPU exited for conditions + * such as an AST before a vm-entry delivered the injection. + */ + if ((info & VMCS_INTR_VALID) != 0) { + return (EIS_EV_EXISTING | EIS_REQ_EXIT); + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + ASSERT(entryinfo & VMCS_INTR_VALID); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) { + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + } + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + state = EIS_EV_INJECTED; + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then inject + * it directly here otherwise enable "NMI window exiting" to + * inject it as soon as we can. + * + * According to the Intel manual, some CPUs do not allow NMI + * injection when STI_BLOCKING is active. That check is + * enforced here, regardless of CPU capability. If running on a + * CPU without such a restriction it will immediately exit and + * the NMI will be injected in the "NMI window exiting" handler. 
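+ *
+ * Editorial sketch of the gate applied below (a summary, not new logic):
+ *
+ *	if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0)
+ *		inject the NMI now (or request an exit if another
+ *		event was already injected this pass);
+ *	else
+ *		set "NMI window exiting" and inject from that handler.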
+ */ + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + if (state == EIS_CAN_INJECT) { + vmx_inject_nmi(vmx, vcpu); + state = EIS_EV_INJECTED; + } else { + return (state | EIS_REQ_EXIT); + } + } else { + vmx_set_nmi_window_exiting(vmx, vcpu); + } + } + + if (vm_extint_pending(vmx->vm, vcpu)) { + if (state != EIS_CAN_INJECT) { + return (state | EIS_REQ_EXIT); + } + if ((gi & HWINTR_BLOCKING) != 0 || + (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) { + return (EIS_GI_BLOCK); + } + + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + + /* Inject the interrupt */ + vmcs_write(VMCS_ENTRY_INTR_INFO, + VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector); + + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + state = EIS_EV_INJECTED; + } + + return (state); +} + +/* + * Inject any interrupts pending on the vLAPIC. + * + * This is done with host CPU interrupts disabled so notification IPIs, either + * from the standard vCPU notification or APICv posted interrupts, will be + * queued on the host APIC and recognized when entering VMX context. + */ +static enum event_inject_state +vmx_inject_vlapic(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +{ + int vector; + + if (!vlapic_pending_intr(vlapic, &vector)) { + return (EIS_CAN_INJECT); + } + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + uint16_t status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + uint16_t status_new = (status_old & 0xff00) | vector; + + /* + * The APICv state will have been synced into the vLAPIC + * as part of vlapic_pending_intr(). Prepare the VMCS + * for the to-be-injected pending interrupt. + */ + if (status_new > status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, + "vmx_inject_interrupts: guest_intr_status " + "changed from 0x%04x to 0x%04x", + status_old, status_new); + } + + /* + * Ensure VMCS state regarding EOI traps is kept in sync + * with the TMRs in the vlapic. + */ + vmx_apicv_sync_tmr(vlapic); + + /* + * The rest of the injection process for injecting the + * interrupt(s) is handled by APICv. It does not preclude other + * event injection from occurring. + */ + return (EIS_CAN_INJECT); + } + + ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID); + + /* Does guest interruptability block injection? */ + if ((vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & HWINTR_BLOCKING) != 0 || + (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) { + return (EIS_GI_BLOCK); + } + + /* Inject the interrupt */ + vmcs_write(VMCS_ENTRY_INTR_INFO, + VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector); + + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + + return (EIS_EV_INJECTED); +} + +/* + * Re-check for events to be injected. + * + * Once host CPU interrupts are disabled, check for the presence of any events + * which require injection processing. If an exit is required upon injection, + * or once the guest becomes interruptable, that will be configured too. 
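+ *
+ * Editorial sketch of how vmx_run() sequences the injection helpers
+ * (a summary of the loop below, not new code):
+ *
+ *	state = vmx_inject_events(vmx, vcpu, rip);
+ *	disable_intr();
+ *	if (state == EIS_CAN_INJECT)
+ *		state = vmx_inject_vlapic(vmx, vcpu, vlapic);
+ *	if (vmx_inject_recheck(vmx, vcpu, state)) {
+ *		enable_intr();
+ *		continue;	/* take another lap before VM entry */
+ *	}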
+ */ +static bool +vmx_inject_recheck(struct vmx *vmx, int vcpu, enum event_inject_state state) +{ + if (state == EIS_CAN_INJECT) { + if (vm_nmi_pending(vmx->vm, vcpu) && + !vmx_nmi_window_exiting(vmx, vcpu)) { + /* queued NMI not blocked by NMI-window-exiting */ + return (true); + } + if (vm_extint_pending(vmx->vm, vcpu)) { + /* queued ExtINT not blocked by existing injection */ + return (true); + } + } else { + if ((state & EIS_REQ_EXIT) != 0) { + /* + * Use a self-IPI to force an immediate exit after + * event injection has occurred. + */ + poke_cpu(CPU->cpu_id); + } else { + /* + * If any event is being injected, an exit immediately + * upon becoming interruptable again will allow pending + * or newly queued events to be injected in a timely + * manner. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + } + return (false); +} + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. + */ +static void +vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || + !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. */ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. 
+ */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. + */ + load_xcr(0, xcrval); + return (HANDLED); +} + +static uint64_t +vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: + vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + + const uint64_t old = vmcs_read(VMCS_GUEST_CR0); + const uint64_t diff = crval ^ old; + /* Flush the TLB if the paging or write-protect bits are changing */ + if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) { + vmx_invvpid(vmx, vcpu, 1); + } + + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. 
+ */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. */ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vmx, vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + + if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_mmio_emul(struct vm_exit *vmexit, struct vie *vie, uint64_t gpa, + uint64_t gla) +{ + struct vm_guest_paging paging; + uint32_t csar; + + vmexit->exitcode = VM_EXITCODE_MMIO_EMUL; + vmexit->inst_length = 0; + vmexit->u.mmio_emul.gpa = gpa; + vmexit->u.mmio_emul.gla = gla; + vmx_paging_info(&paging); + + switch (paging.cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.mmio_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.mmio_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.mmio_emul.cs_base = 0; + vmexit->u.mmio_emul.cs_d = 0; + break; + } + + vie_init_mmio(vie, NULL, 0, &paging, gpa); +} + +static void +vmexit_inout(struct vm_exit *vmexit, struct vie *vie, uint64_t qual, + uint32_t eax) +{ + struct vm_guest_paging paging; + struct vm_inout *inout; + + inout = &vmexit->u.inout; + + inout->bytes = (qual & 0x7) + 1; + inout->flags = 0; + 
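+	/*
+	 * Editorial note: the masks below decode the I/O-instruction exit
+	 * qualification (see the SDM table "Exit Qualification for I/O
+	 * Instructions"):
+	 *	bits 2:0   - size of access, minus one (1/2/4 bytes)
+	 *	bit  3     - direction (1 = IN)
+	 *	bit  4     - string instruction (INS/OUTS)
+	 *	bit  5     - REP prefixed
+	 *	bits 31:16 - port number
+	 */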
inout->flags |= (qual & 0x8) ? INOUT_IN : 0; + inout->flags |= (qual & 0x10) ? INOUT_STR : 0; + inout->flags |= (qual & 0x20) ? INOUT_REP : 0; + inout->port = (uint16_t)(qual >> 16); + inout->eax = eax; + if (inout->flags & INOUT_STR) { + uint64_t inst_info; + + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + + /* + * According to the SDM, bits 9:7 encode the address size of the + * ins/outs operation, but only values 0/1/2 are expected, + * corresponding to 16/32/64 bit sizes. + */ + inout->addrsize = 2 << BITX(inst_info, 9, 7); + VERIFY(inout->addrsize == 2 || inout->addrsize == 4 || + inout->addrsize == 8); + + if (inout->flags & INOUT_IN) { + /* + * The bits describing the segment in INSTRUCTION_INFO + * are not defined for ins, leaving it to system + * software to assume %es (encoded as 0) + */ + inout->segment = 0; + } else { + /* + * Bits 15-17 encode the segment for OUTS. + * This value follows the standard x86 segment order. + */ + inout->segment = (inst_info >> 15) & 0x7; + } + } + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmx_paging_info(&paging); + vie_init_inout(vie, inout, vmexit->inst_length, &paging); + + /* The in/out emulation will handle advancing %rip */ + vmexit->inst_length = 0; +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = PROT_EXEC; + else + fault_type = PROT_READ; + + return (fault_type); +} + +static bool +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (false); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (false); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (false); + } + + return (true); +} + +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + const uint_t offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. 
+ */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + const uint32_t *apic_regs = + (uint32_t *)(vlapic->apic_page); + const uint32_t vector = + apic_regs[APIC_OFFSET_SELF_IPI / 4]; + + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + vlapic_icrlo_write_handler(vlapic); + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + return (UNHANDLED); + } + return (HANDLED); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + struct vie *vie; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vie = vm_vie_ctx(vmx->vm, vcpuid); + vmexit_mmio_emul(vmexit, vie, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + +static int +vmx_handle_msr(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit, + bool is_wrmsr) +{ + struct vmxctx *vmxctx = &vmx->ctx[vcpuid]; + const uint32_t ecx = vmxctx->guest_rcx; + vm_msr_result_t res; + uint64_t val = 0; + + if (is_wrmsr) { + vmm_stat_incr(vmx->vm, vcpuid, VMEXIT_WRMSR, 1); + val = vmxctx->guest_rdx << 32 | (uint32_t)vmxctx->guest_rax; + + if (vlapic_owned_msr(ecx)) { + struct vlapic *vlapic = vm_lapic(vmx->vm, vcpuid); + + res = vlapic_wrmsr(vlapic, ecx, val); + } else { + res = vmx_wrmsr(vmx, vcpuid, ecx, val); + } + } else { + vmm_stat_incr(vmx->vm, vcpuid, VMEXIT_RDMSR, 1); + + if (vlapic_owned_msr(ecx)) { + struct vlapic *vlapic = vm_lapic(vmx->vm, vcpuid); + + res = vlapic_rdmsr(vlapic, ecx, &val); + } else { + res = vmx_rdmsr(vmx, vcpuid, ecx, &val); + } + } + + switch (res) { + case VMR_OK: + /* Store rdmsr result in the appropriate registers */ + if (!is_wrmsr) { + vmxctx->guest_rax = (uint32_t)val; + vmxctx->guest_rdx = val >> 32; + } + return (HANDLED); + case VMR_GP: + vm_inject_gp(vmx->vm, vcpuid); + return (HANDLED); + case VMR_UNHANLDED: + vmexit->exitcode = is_wrmsr ? + VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + return (UNHANDLED); + default: + panic("unexpected msr result %u\n", res); + } +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int error, errcode, errcode_valid, handled; + struct vmxctx *vmxctx; + struct vie *vie; + struct vlapic *vlapic; + struct vm_task_switch *ts; + uint32_t idtvec_info, idtvec_err, intr_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + vmxctx = &vmx->ctx[vcpu]; + + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (reason == EXIT_REASON_MCE_DURING_ENTRY) { + VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); + vmm_call_trap(T_MCE); + return (1); + } + + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. 
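+ *
+ * Editorial note: the value handed to vm_exit_intinfo() below packs the
+ * IDT-vectoring information and, when valid, its error code into one
+ * 64-bit quantity:
+ *
+ *	exitintinfo = idtvec_info | ((uint64_t)idtvec_err << 32);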
+ */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vmx, vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vmx, vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + case EXIT_REASON_WRMSR: + handled = vmx_handle_msr(vmx, vcpu, vmexit, + reason == EXIT_REASON_WRMSR); + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); + vmx_clear_int_window_exiting(vmx, vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpu, vmexit, intr_info); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? + */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %x", intr_info)); + vmx_trigger_hostintr(intr_info & 0xff); + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. 
+ */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vmx->vm, vcpu)) + vmx_inject_nmi(vmx, vcpu); + vmx_clear_nmi_window_exiting(vmx, vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); + vie = vm_vie_ctx(vmx->vm, vcpu); + vmexit_inout(vmexit, vie, qual, (uint32_t)vmxctx->guest_rax); + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %x", intr_info)); + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_vec != IDT_DF) && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if (intr_type == VMCS_INTR_T_NMI) + return (1); + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); + vmm_call_trap(T_MCE); + return (1); + } + + /* + * If the hypervisor has requested user exits for + * debug exceptions, bounce them out to userland. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION && + intr_vec == IDT_BP && + (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { + vmexit->exitcode = VM_EXITCODE_BPT; + vmexit->u.bpt.inst_length = vmexit->inst_length; + vmexit->inst_length = 0; + break; + } + + if (intr_vec == IDT_PF) { + vmxctx->guest_cr2 = qual; + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpu, vmexit, intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. 
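+ *
+ * Editorial summary of the classification below:
+ *
+ *	gpa backed by guest memory (or the APIC-access page)
+ *		-> VM_EXITCODE_PAGING (nested page fault)
+ *	otherwise, a data read/write with a valid guest-linear translation
+ *		-> MMIO emulation via vmexit_mmio_emul()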
+ */ + gpa = vmcs_gpa(); + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpu, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { + vie = vm_vie_ctx(vmx->vm, vcpu); + vmexit_mmio_emul(vmexit, vie, gpa, vmcs_gla()); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MMIO_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpu, vmexit, gpa); + } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. + */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpu, vmexit, vlapic); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_TPR: + vlapic = vm_lapic(vmx->vm, vcpu); + vlapic_sync_tpr(vlapic); + vmexit->inst_length = 0; + handled = HANDLED; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; + default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpu, vmexit, reason); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. 
+ */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpu, vmexit, handled); + return (handled); +} + +static void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + case VMX_INVEPT_ERROR: + case VMX_VMWRITE_ERROR: + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. + */ +static __inline void +vmx_exit_handle_possible_nmi(struct vm_exit *vmexit) +{ + ASSERT(!interrupts_enabled()); + + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EXCEPTION) { + uint32_t intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + ASSERT(intr_info & VMCS_INTR_VALID); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + ASSERT3U(intr_info & 0xff, ==, IDT_NMI); + vmm_call_trap(T_NMIFLT); + } + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + uint64_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. */ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. 
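+ * (Editorial note: restoring DR7 and DEBUGCTL only after DR0-DR3 and DR6
+ * hold host values again avoids re-arming host breakpoints or branch
+ * tracing against the guest's debug-register contents, and deferring
+ * PSL_T avoids taking a single-step trap mid-restore.)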
+ */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); +} + +static int +vmx_run(void *arg, int vcpu, uint64_t rip) +{ + int rc, handled, launched; + struct vmx *vmx; + struct vm *vm; + struct vmxctx *vmxctx; + uintptr_t vmcs_pa; + struct vm_exit *vmexit; + struct vlapic *vlapic; + uint32_t exit_reason; + bool tpr_shadow_active; + vm_client_t *vmc; + + vmx = arg; + vm = vmx->vm; + vmcs_pa = vmx->vmcs_pa[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + vlapic = vm_lapic(vm, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + vmc = vm_get_vmclient(vm, vcpu); + launched = 0; + tpr_shadow_active = vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) && + !vmx_cap_en(vmx, VMX_CAP_APICV) && + (vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0; + + vmx_msr_guest_enter(vmx, vcpu); + + vmcs_load(vmcs_pa); + + VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_LOADED; + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmx_vminit(). + */ + vmcs_write(VMCS_HOST_CR3, rcr3()); + + vmcs_write(VMCS_GUEST_RIP, rip); + vmx_set_pcpu_defaults(vmx, vcpu); + do { + enum event_inject_state inject_state; + uint64_t eptgen; + + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%lx/%lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + + /* + * Perform initial event/exception/interrupt injection before + * host CPU interrupts are disabled. + */ + inject_state = vmx_inject_events(vmx, vcpu, rip); + + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after vmx_inject_vlapic() will not be + * "lost" because it will be held pending in the host APIC + * because interrupts are disabled. The pending interrupt will + * be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by vmspace + * invalidation. + */ + disable_intr(); + + /* + * If not precluded by existing events, inject any interrupt + * pending on the vLAPIC. As a lock-less operation, it is safe + * (and prudent) to perform with host CPU interrupts disabled. + */ + if (inject_state == EIS_CAN_INJECT) { + inject_state = vmx_inject_vlapic(vmx, vcpu, vlapic); + } + + /* + * Check for vCPU bail-out conditions. This must be done after + * vmx_inject_events() to detect a triple-fault condition. + */ + if (vcpu_entry_bailout_checks(vmx->vm, vcpu, rip)) { + enable_intr(); + break; + } + + if (vcpu_run_state_pending(vm, vcpu)) { + enable_intr(); + vm_exit_run_state(vmx->vm, vcpu, rip); + break; + } + + /* + * If subsequent activity queued events which require injection + * handling, take another lap to handle them. 
+ */ + if (vmx_inject_recheck(vmx, vcpu, inject_state)) { + enable_intr(); + handled = HANDLED; + continue; + } + + if ((rc = smt_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; + } + break; + } + + /* + * If this thread has gone off-cpu due to mutex operations + * during vmx_run, the VMCS will have been unloaded, forcing a + * re-VMLAUNCH as opposed to VMRESUME. + */ + launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0; + /* + * Restoration of the GDT limit is taken care of by + * vmx_savectx(). Since the maximum practical index for the + * IDT is 255, restoring its limits from the post-VMX-exit + * default of 0xffff is not a concern. + * + * Only 64-bit hypervisor callers are allowed, which forgoes + * the need to restore any LDT descriptor. Toss an error to + * anyone attempting to break that rule. + */ + if (curproc->p_model != DATAMODEL_LP64) { + smt_release(); + enable_intr(); + bzero(vmexit, sizeof (*vmexit)); + vmexit->rip = rip; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_FAIL_INVALID; + handled = UNHANDLED; + break; + } + + if (tpr_shadow_active) { + vmx_tpr_shadow_enter(vlapic); + } + + /* + * Indicate activation of vmspace (EPT) table just prior to VMX + * entry, checking for the necessity of an invept invalidation. + */ + eptgen = vmc_table_enter(vmc); + if (vmx->eptgen[curcpu] != eptgen) { + /* + * VMspace generation does not match what was previously + * used on this host CPU, so all mappings associated + * with this EP4TA must be invalidated. + */ + invept(1, vmx->eptp); + vmx->eptgen[curcpu] = eptgen; + } + + vmx_run_trace(vmx, vcpu); + vcpu_ustate_change(vm, vcpu, VU_RUN); + vmx_dr_enter_guest(vmxctx); + + /* Perform VMX entry */ + rc = vmx_enter_guest(vmxctx, vmx, launched); + + vmx_dr_leave_guest(vmxctx); + vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); + + vmx->vmcs_state[vcpu] |= VS_LAUNCHED; + smt_release(); + + if (tpr_shadow_active) { + vmx_tpr_shadow_exit(vlapic); + } + + /* Collect some information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = rip; + + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_possible_nmi(vmexit); + } + enable_intr(); + vmc_table_exit(vmc); + + if (rc == VMX_GUEST_VMEXIT) { + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + vmx_exit_inst_error(vmxctx, rc, vmexit); + } + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, + uint32_t, exit_reason); + rip = vmexit->rip; + } while (handled); + + /* If a VM exit has been handled then the exitcode must be BOGUS */ + if (handled && vmexit->exitcode != VM_EXITCODE_BOGUS) { + panic("Non-BOGUS exitcode (%d) unexpected for handled VM exit", + vmexit->exitcode); + } + + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", + vmexit->exitcode); + + vmcs_clear(vmcs_pa); + vmx_msr_guest_exit(vmx, vcpu); + + VERIFY(vmx->vmcs_state != VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_NONE; + + return (0); +} + +static void +vmx_vmcleanup(void *arg) +{ + int i; + struct vmx *vmx = arg; + uint16_t maxcpus; + + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + kmem_free(vmx->apic_access_page, 
PAGESIZE); + } else { + VERIFY3P(vmx->apic_access_page, ==, NULL); + } + + vmx_msr_bitmap_destroy(vmx); + + maxcpus = vm_get_maxcpus(vmx->vm); + for (i = 0; i < maxcpus; i++) + vpid_free(vmx->state[i].vpid); + + free(vmx, M_VMX); +} + +static uint64_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); + default: + break; + } + return (NULL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + int running, hostcpu, err; + struct vmx *vmx = arg; + uint64_t *regp; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + /* VMCS access not required for ctx reads */ + if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) { + *retval = *regp; + return (0); + } + + if (!running) { + vmcs_load(vmx->vmcs_pa[vcpu]); + } + + err = 0; + if (reg == VM_REG_GUEST_INTR_SHADOW) { + uint64_t gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; + } else { + uint32_t encoding; + + encoding = vmcs_field_encoding(reg); + switch (encoding) { + case VMCS_GUEST_CR0: + /* Take the shadow bits into account */ + *retval = vmx_unshadow_cr0(vmcs_read(encoding), + vmcs_read(VMCS_CR0_SHADOW)); + break; + case VMCS_GUEST_CR4: + /* Take the shadow bits into account */ + *retval = vmx_unshadow_cr4(vmcs_read(encoding), + vmcs_read(VMCS_CR4_SHADOW)); + break; + case VMCS_INVALID_ENCODING: + err = EINVAL; + break; + default: + *retval = vmcs_read(encoding); + break; + } + } + + if (!running) { + vmcs_clear(vmx->vmcs_pa[vcpu]); + } + + return (err); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int running, hostcpu, error; + struct vmx *vmx = arg; + uint64_t *regp; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + /* VMCS access not required for ctx writes */ + if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) { + *regp = val; + return (0); + } + + if (!running) { + vmcs_load(vmx->vmcs_pa[vcpu]); + } + + if (reg == VM_REG_GUEST_INTR_SHADOW) { + if (val != 0) { + /* + * Forcing the vcpu into an interrupt shadow is not + * presently supported. 
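+ * A write of zero is honored below by clearing HWINTR_BLOCKING in the
+ * guest interruptibility-state field; any other value fails with EINVAL.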
+ */ + error = EINVAL; + } else { + uint64_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + error = 0; + } + } else { + uint32_t encoding; + + error = 0; + encoding = vmcs_field_encoding(reg); + switch (encoding) { + case VMCS_GUEST_IA32_EFER: + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode + * guest" bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0) { + uint64_t ctls; + + ctls = vmcs_read(VMCS_ENTRY_CTLS); + if (val & EFER_LMA) { + ctls |= VM_ENTRY_GUEST_LMA; + } else { + ctls &= ~VM_ENTRY_GUEST_LMA; + } + vmcs_write(VMCS_ENTRY_CTLS, ctls); + } + vmcs_write(encoding, val); + break; + case VMCS_GUEST_CR0: + /* + * The guest is not allowed to modify certain bits in + * %cr0 and %cr4. To maintain the illusion of full + * control, they have shadow versions which contain the + * guest-perceived (via reads from the register) values + * as opposed to the guest-effective values. + * + * This is detailed in the SDM: Vol. 3 Ch. 24.6.6. + */ + vmcs_write(VMCS_CR0_SHADOW, val); + vmcs_write(encoding, vmx_fix_cr0(val)); + break; + case VMCS_GUEST_CR4: + /* See above for detail on %cr4 shadowing */ + vmcs_write(VMCS_CR4_SHADOW, val); + vmcs_write(encoding, vmx_fix_cr4(val)); + break; + case VMCS_GUEST_CR3: + vmcs_write(encoding, val); + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. + */ + vmx_invvpid(vmx, vcpu, running); + break; + case VMCS_INVALID_ENCODING: + error = EINVAL; + break; + default: + vmcs_write(encoding, val); + break; + } + } + + if (!running) { + vmcs_clear(vmx->vmcs_pa[vcpu]); + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + uint32_t base, limit, access; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + if (!running) { + vmcs_load(vmx->vmcs_pa[vcpu]); + } + + vmcs_seg_desc_encoding(seg, &base, &limit, &access); + desc->base = vmcs_read(base); + desc->limit = vmcs_read(limit); + if (access != VMCS_INVALID_ENCODING) { + desc->access = vmcs_read(access); + } else { + desc->access = 0; + } + + if (!running) { + vmcs_clear(vmx->vmcs_pa[vcpu]); + } + return (0); +} + +static int +vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + uint32_t base, limit, access; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + if (!running) { + vmcs_load(vmx->vmcs_pa[vcpu]); + } + + vmcs_seg_desc_encoding(seg, &base, &limit, &access); + vmcs_write(base, desc->base); + vmcs_write(limit, desc->limit); + if (access != VMCS_INVALID_ENCODING) { + vmcs_write(access, desc->access); + } + + if (!running) { + vmcs_clear(vmx->vmcs_pa[vcpu]); + } + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case 
VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; + case VM_CAP_BPT_EXIT: + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + uint32_t baseval, reg, flag; + uint32_t *pptr; + int error; + + error = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + error = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + error = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + error = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + error = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + case VM_CAP_BPT_EXIT: + error = 0; + + /* Don't change the bitmap if we are tracing all exceptions. */ + if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { + pptr = &vmx->cap[vcpu].exc_bitmap; + baseval = *pptr; + flag = (1 << IDT_BP); + reg = VMCS_EXCEPTION_BITMAP; + } + break; + default: + break; + } + + if (error != 0) { + return (error); + } + + if (pptr != NULL) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + vmcs_load(vmx->vmcs_pa[vcpu]); + vmcs_write(reg, baseval); + vmcs_clear(vmx->vmcs_pa[vcpu]); + + /* + * Update optional stored flags, and record + * setting + */ + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + + return (0); +} + +struct vlapic_vtx { + struct vlapic vlapic; + + /* Align to the nearest cacheline */ + uint8_t _pad[64 - (sizeof (struct vlapic) % 64)]; + + /* TMR handling state for posted interrupts */ + uint32_t tmr_active[8]; + uint32_t pending_level[8]; + uint32_t pending_edge[8]; + + struct pir_desc *pir_desc; + struct vmx *vmx; + uint_t pending_prio; + boolean_t tmr_sync; +}; + +CTASSERT((offsetof(struct vlapic_vtx, tmr_active) & 63) == 0); + +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + +static vcpu_notify_t +vmx_apicv_set_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint32_t mask, tmrval; + int idx; + vcpu_notify_t notify = VCPU_NOTIFY_NONE; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + idx = vector / 32; + mask = 1UL << (vector % 32); + + /* + * If the currently asserted TMRs do not match the state requested by + * the incoming interrupt, an exit will be required to reconcile those + * bits in the APIC page. This will keep the vLAPIC behavior in line + * with the architecturally defined expectations. + * + * If actors of mixed types (edge and level) are racing against the same + * vector (toggling its TMR bit back and forth), the results could + * inconsistent. Such circumstances are considered a rare edge case and + * are never expected to be found in the wild. 
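+ *
+ * The handling below reflects this: an edge-triggered vector whose TMR
+ * bit is still asserted (or a level-triggered vector whose TMR bit is
+ * clear) is staged in pending_edge/pending_level and the vCPU is forced
+ * to exit so vmx_apicv_sync() can reconcile the TMR before the vector is
+ * queued in the IRR.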
+ */ + tmrval = atomic_load_acq_int(&vlapic_vtx->tmr_active[idx]); + if (!level) { + if ((tmrval & mask) != 0) { + /* Edge-triggered interrupt needs TMR de-asserted */ + atomic_set_int(&vlapic_vtx->pending_edge[idx], mask); + atomic_store_rel_long(&pir_desc->pending, 1); + return (VCPU_NOTIFY_EXIT); + } + } else { + if ((tmrval & mask) == 0) { + /* Level-triggered interrupt needs TMR asserted */ + atomic_set_int(&vlapic_vtx->pending_level[idx], mask); + atomic_store_rel_long(&pir_desc->pending, 1); + return (VCPU_NOTIFY_EXIT); + } + } + + /* + * If the interrupt request does not require manipulation of the TMRs + * for delivery, set it in PIR descriptor. It cannot be inserted into + * the APIC page while the vCPU might be running. + */ + atomic_set_int(&pir_desc->pir[idx], mask); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. + */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = VCPU_NOTIFY_APIC; + vlapic_vtx->pending_prio = 0; + } else { + const uint_t old_prio = vlapic_vtx->pending_prio; + const uint_t prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = VCPU_NOTIFY_APIC; + } + } + + return (notify); +} + +static void +vmx_apicv_accepted(struct vlapic *vlapic, int vector) +{ + /* + * When APICv is enabled for an instance, the traditional interrupt + * injection method (populating ENTRY_INTR_INFO in the VMCS) is not + * used and the CPU does the heavy lifting of virtual interrupt + * delivery. For that reason vmx_intr_accepted() should never be called + * when APICv is enabled. 
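+ *
+ * Reaching this function therefore indicates a programming error, hence
+ * the panic below rather than any attempt at fallback handling.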
+ */ + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_apicv_sync_tmr(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + const uint32_t *tmrs; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + tmrs = &vlapic_vtx->tmr_active[0]; + + if (!vlapic_vtx->tmr_sync) { + return; + } + + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)tmrs[1] << 32) | tmrs[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)tmrs[3] << 32) | tmrs[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)tmrs[5] << 32) | tmrs[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)tmrs[7] << 32) | tmrs[6]); + vlapic_vtx->tmr_sync = B_FALSE; +} + +static void +vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) +{ + struct vmx *vmx; + uint32_t proc_ctls; + int vcpuid; + + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + + proc_ctls = vmx->cap[vcpuid].proc_ctls; + proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; + proc_ctls |= PROCBASED_CR8_LOAD_EXITING; + proc_ctls |= PROCBASED_CR8_STORE_EXITING; + vmx->cap[vcpuid].proc_ctls = proc_ctls; + + vmcs_load(vmx->vmcs_pa[vcpuid]); + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); + vmcs_clear(vmx->vmcs_pa[vcpuid]); +} + +static void +vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) +{ + struct vmx *vmx; + uint32_t proc_ctls2; + int vcpuid; + + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; + + vmcs_load(vmx->vmcs_pa[vcpuid]); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); + vmcs_clear(vmx->vmcs_pa[vcpuid]); + + vmx_allow_x2apic_msrs(vmx, vcpuid); +} + +static void +vmx_apicv_notify(struct vlapic *vlapic, int hostcpu) +{ + psm_send_pir_ipi(hostcpu); +} + +static void +vmx_apicv_sync(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint_t i; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + lapic = vlapic->apic_page; + + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + return; + } + + vlapic_vtx->pending_prio = 0; + + /* Make sure the invalid (0-15) vectors are not set */ + ASSERT0(vlapic_vtx->pending_level[0] & 0xffff); + ASSERT0(vlapic_vtx->pending_edge[0] & 0xffff); + ASSERT0(pir_desc->pir[0] & 0xffff); + + for (i = 0; i <= 7; i++) { + uint32_t *tmrp = &lapic->tmr0 + (i * 4); + uint32_t *irrp = &lapic->irr0 + (i * 4); + + const uint32_t pending_level = + atomic_readandclear_int(&vlapic_vtx->pending_level[i]); + const uint32_t pending_edge = + atomic_readandclear_int(&vlapic_vtx->pending_edge[i]); + const uint32_t pending_inject = + atomic_readandclear_int(&pir_desc->pir[i]); + + if (pending_level != 0) { + /* + * Level-triggered interrupts assert their corresponding + * bit in the TMR when queued in IRR. + */ + *tmrp |= pending_level; + *irrp |= pending_level; + } + if (pending_edge != 0) { + /* + * When queuing an edge-triggered interrupt in IRR, the + * corresponding bit in the TMR is cleared. + */ + *tmrp &= ~pending_edge; + *irrp |= pending_edge; + } + if (pending_inject != 0) { + /* + * Interrupts which do not require a change to the TMR + * (because it already matches the necessary state) can + * simply be queued in IRR. 
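+ * These are the vectors accumulated in the PIR ('pending_inject' here),
+ * as opposed to the pending_level/pending_edge staging bits handled
+ * above.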
+ */ + *irrp |= pending_inject; + } + + if (*tmrp != vlapic_vtx->tmr_active[i]) { + /* Check if VMX EOI triggers require updating. */ + vlapic_vtx->tmr_active[i] = *tmrp; + vlapic_vtx->tmr_sync = B_TRUE; + } + } +} + +static void +vmx_tpr_shadow_enter(struct vlapic *vlapic) +{ + /* + * When TPR shadowing is enabled, VMX will initiate a guest exit if its + * TPR falls below a threshold priority. That threshold is set to the + * current TPR priority, since guest interrupt status should be + * re-evaluated if its TPR is set lower. + */ + vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); +} + +static void +vmx_tpr_shadow_exit(struct vlapic *vlapic) +{ + /* + * Unlike full APICv, where changes to the TPR are reflected in the PPR, + * with TPR shadowing, that duty is relegated to the VMM. Upon exit, + * the PPR is updated to reflect any change in the TPR here. + */ + vlapic_sync_tpr(vlapic); +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof (struct vlapic_vtx), M_VLAPIC, + M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) { + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; + } + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + vlapic->ops.set_intr_ready = vmx_apicv_set_ready; + vlapic->ops.sync_state = vmx_apicv_sync; + vlapic->ops.intr_accepted = vmx_apicv_accepted; + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; + + if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { + vlapic->ops.post_intr = vmx_apicv_notify; + } + } + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +static void +vmx_savectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmcs_clear(vmx->vmcs_pa[vcpu]); + vmx_msr_guest_exit(vmx, vcpu); + /* + * Having VMCLEARed the VMCS, it can no longer be re-entered + * with VMRESUME, but must be VMLAUNCHed again. + */ + vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED; + } + + reset_gdtr_limit(); +} + +static void +vmx_restorectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED); + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmx_msr_guest_enter(vmx, vcpu); + vmcs_load(vmx->vmcs_pa[vcpu]); + } +} + +struct vmm_ops vmm_ops_intel = { + .init = vmx_init, + .cleanup = vmx_cleanup, + .resume = vmx_restore, + + .vminit = vmx_vminit, + .vmrun = vmx_run, + .vmcleanup = vmx_vmcleanup, + .vmgetreg = vmx_getreg, + .vmsetreg = vmx_setreg, + .vmgetdesc = vmx_getdesc, + .vmsetdesc = vmx_setdesc, + .vmgetcap = vmx_getcap, + .vmsetcap = vmx_setcap, + .vlapic_init = vmx_vlapic_init, + .vlapic_cleanup = vmx_vlapic_cleanup, + + .vmsavectx = vmx_savectx, + .vmrestorectx = vmx_restorectx, +}; + +/* Side-effect free HW validation derived from checks in vmx_init. 
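+ * On failure, *msg is pointed at a constant string describing the
+ * unsupported control set and the error from vmx_set_ctlreg() is
+ * returned; zero indicates all required capabilities are present.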
*/ +int +vmx_x86_supported(const char **msg) +{ + int error; + uint32_t tmp; + + ASSERT(msg != NULL); + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired primary " + "processor-based controls"; + return (error); + } + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired secondary " + "processor-based controls"; + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired pin-based controls"; + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired exit controls"; + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired entry controls"; + return (error); + } + + /* Unrestricted guest is nominally optional, but not for us. */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp); + if (error) { + *msg = "processor does not support desired unrestricted guest " + "controls"; + return (error); + } + + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/intel/vmx.h b/usr/src/uts/intel/io/vmm/intel/vmx.h new file mode 100644 index 0000000000..197ca1341d --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx.h @@ -0,0 +1,201 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +struct vmxctx { + uint64_t guest_rdi; /* Guest state */ + uint64_t guest_rsi; + uint64_t guest_rdx; + uint64_t guest_rcx; + uint64_t guest_r8; + uint64_t guest_r9; + uint64_t guest_rax; + uint64_t guest_rbx; + uint64_t guest_rbp; + uint64_t guest_r10; + uint64_t guest_r11; + uint64_t guest_r12; + uint64_t guest_r13; + uint64_t guest_r14; + uint64_t guest_r15; + uint64_t guest_cr2; + uint64_t guest_dr0; + uint64_t guest_dr1; + uint64_t guest_dr2; + uint64_t guest_dr3; + uint64_t guest_dr6; + + uint64_t host_dr0; + uint64_t host_dr1; + uint64_t host_dr2; + uint64_t host_dr3; + uint64_t host_dr6; + uint64_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; + uint32_t exc_bitmap; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof (struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint32_t pir[8]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof (struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + IDX_MSR_PAT, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +typedef enum { + VS_NONE = 0x0, + VS_LAUNCHED = 0x1, + VS_LOADED = 0x2 +} vmcs_state_t; + +/* virtual machine softc */ +struct vmx { + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + uint8_t *msr_bitmap[VM_MAXCPU]; /* one MSR bitmap per vCPU */ + struct pir_desc pir_desc[VM_MAXCPU]; + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t host_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t tsc_offset_active[VM_MAXCPU]; + vmcs_state_t vmcs_state[VM_MAXCPU]; + uintptr_t vmcs_pa[VM_MAXCPU]; + void *apic_access_page; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; + enum vmx_caps vmx_caps; + struct vm *vm; + /* + * Track the latest vmspace generation as it is run on a given host CPU. 
+ * This allows us to react to modifications to the vmspace (such as + * unmap or changed protection) which necessitate flushing any + * guest-physical TLB entries tagged for this guest via 'invept'. + */ + uint64_t eptgen[MAXCPU]; +}; +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); + +static __inline bool +vmx_cap_en(const struct vmx *vmx, enum vmx_caps cap) +{ + return ((vmx->vmx_caps & cap) == cap); +} + + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE_ASM \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + + +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +#define VMX_VMWRITE_ERROR 4 + +int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); + +int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); + +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + +#endif diff --git a/usr/src/uts/intel/io/vmm/intel/vmx_controls.h b/usr/src/uts/intel/io/vmm/intel/vmx_controls.h new file mode 100644 index 0000000000..ae6ff9b5aa --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx_controls.h @@ -0,0 +1,98 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/usr/src/uts/intel/io/vmm/intel/vmx_msr.c b/usr/src/uts/intel/io/vmm/intel/vmx_msr.c new file mode 100644 index 0000000000..f9c292f659 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx_msr.c @@ -0,0 +1,496 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Copyright 2020 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <sys/vmm_kernel.h> + +#include "vmx.h" +#include "vmx_msr.h" + +static bool +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + return ((msr_val & (1UL << (bitpos + 32))) != 0); +} + +static bool +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + return ((msr_val & (1UL << bitpos)) == 0); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. 
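+ *
+ * For example (illustrative only): a caller that needs "HLT exiting"
+ * set and "CR3-load exiting" clear would pass PROCBASED_HLT_EXITING in
+ * 'ones_mask' and PROCBASED_CR3_LOAD_EXITING in 'zeros_mask'; any bit
+ * present in neither mask is filled in with its default setting as
+ * reported by the capability MSRs.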
+ */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + bool true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) { + /* b(ii),c(ii) */ + *retval &= ~(1 << i); + } else if (ones_mask & (1 << i)) { + /* b(ii), c(ii) */ + *retval |= 1 << i; + } else if (!true_ctls_avail) { + /* b(iii) */ + *retval &= ~(1 << i); + } else if (vmx_ctl_allows_zero_setting(val, i)) { + /* c(iii) */ + *retval &= ~(1 << i); + } else if (vmx_ctl_allows_one_setting(val, i)) { + /* c(iv) */ + *retval |= 1 << i; + } else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +vmx_msr_bitmap_initialize(struct vmx *vmx) +{ + for (uint_t i = 0; i < VM_MAXCPU; i++) { + uint8_t *bitmap; + + bitmap = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0); + memset(bitmap, 0xff, PAGESIZE); + + vmx->msr_bitmap[i] = bitmap; + } +} + +void +vmx_msr_bitmap_destroy(struct vmx *vmx) +{ + for (uint_t i = 0; i < VM_MAXCPU; i++) { + VERIFY3P(vmx->msr_bitmap[i], !=, NULL); + kmem_free(vmx->msr_bitmap[i], PAGESIZE); + vmx->msr_bitmap[i] = NULL; + } +} + +void +vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc) +{ + uint8_t *bitmap = vmx->msr_bitmap[vcpuid]; + int byte, bit; + + if (msr <= 0x00001FFF) { + byte = msr / 8; + } else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) { + byte = 1024 + (msr - 0xC0000000) / 8; + } else { + panic("Invalid MSR for bitmap: %x", msr); + } + + bit = msr & 0x7; + + if (acc & MSR_BITMAP_ACCESS_READ) { + bitmap[byte] &= ~(1 << bit); + } else { + bitmap[byte] |= 1 << bit; + } + + byte += 2048; + if (acc & MSR_BITMAP_ACCESS_WRITE) { + bitmap[byte] &= ~(1 << bit); + } else { + bitmap[byte] |= 1 << bit; + } +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; + +static bool +nehalem_cpu(void) +{ + uint_t family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. 
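+ *
+ * The result (together with westmere_cpu() below) only affects the bus
+ * frequency assumed when deriving the ratio used for the synthesized
+ * MSR_PLATFORM_INFO and MSR_TURBO_RATIO_LIMIT values in vmx_msr_init().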
+ */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + uint_t family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. + */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* + * It is safe to allow direct access to MSR_GSBASE and + * MSR_FSBASE. The guest FSBASE and GSBASE are saved and + * restored during vm-exit and vm-entry respectively. The host + * FSBASE and GSBASE are always restored from the vmcs host + * state area on vm-exit. + * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by + * the guest. + * + * MSR_EFER is saved and restored in the guest VMCS area on a VM + * exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * The TSC MSR is exposed read-only. Writes are disallowed as + * that will impact the host TSC. 
If the guest does a write the + * "use TSC offsetting" execution control is enabled and the + * difference between the host TSC and the guest TSC is written + * into the TSC offset in the VMCS. + */ + guest_msr_rw(vmx, vcpuid, MSR_GSBASE); + guest_msr_rw(vmx, vcpuid, MSR_FSBASE); + guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR); + guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR); + guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR); + guest_msr_rw(vmx, vcpuid, MSR_EFER); + guest_msr_ro(vmx, vcpuid, MSR_TSC); + + /* + * The guest may have direct access to these MSRs as they are + * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit(). + */ + guest_msr_rw(vmx, vcpuid, MSR_LSTAR); + guest_msr_rw(vmx, vcpuid, MSR_CSTAR); + guest_msr_rw(vmx, vcpuid, MSR_STAR); + guest_msr_rw(vmx, vcpuid, MSR_SF_MASK); + guest_msr_rw(vmx, vcpuid, MSR_KGSBASE); + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); +} + +void +vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ + wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); +} + +void +vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save guest MSRs */ + guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +vm_msr_result_t +vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val) +{ + const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + switch (num) { + case MSR_IA32_FEATURE_CONTROL: + /* + * We currently don't support SGX support in guests, so + * always report those features as disabled with the MSR + * locked so the guest won't attempt to write to it. 
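+ *
+ * Only the lock bit is reported below, so the enable bits (VMX, SGX)
+ * read as clear and the guest sees a register it architecturally cannot
+ * modify.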
+ */ + *val = IA32_FEATURE_CONTROL_LOCK; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; + default: + return (VMR_UNHANLDED); + } + return (VMR_OK); +} + +vm_msr_result_t +vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + uint64_t changed; + + switch (num) { + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) { + return (VMR_UNHANLDED); + } + break; + case MSR_PAT: + if (!pat_valid(val)) { + return (VMR_GP); + } + guest_msrs[IDX_MSR_PAT] = val; + break; + default: + return (VMR_UNHANLDED); + } + + return (VMR_OK); +} diff --git a/usr/src/uts/intel/io/vmm/intel/vmx_msr.h b/usr/src/uts/intel/io/vmm/intel/vmx_msr.h new file mode 100644 index 0000000000..551f2d659a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx_msr.h @@ -0,0 +1,72 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +/* + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid); +vm_msr_result_t vmx_rdmsr(struct vmx *, int, uint32_t, uint64_t *); +vm_msr_result_t vmx_wrmsr(struct vmx *, int, uint32_t, uint64_t); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void vmx_msr_bitmap_initialize(struct vmx *); +void vmx_msr_bitmap_destroy(struct vmx *); +void vmx_msr_bitmap_change_access(struct vmx *, int, uint_t, int); + +#define guest_msr_rw(vmx, vcpuid, msr) \ + vmx_msr_bitmap_change_access((vmx), (vcpuid), (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, vcpuid, msr) \ + vmx_msr_bitmap_change_access((vmx), (vcpuid), (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/usr/src/uts/intel/io/vmm/intel/vmx_support.s b/usr/src/uts/intel/io/vmm/intel/vmx_support.s new file mode 100644 index 0000000000..60f761d652 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vmx_support.s @@ -0,0 +1,309 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* Porting note: This is named 'vmx_support.S' upstream. */ + +#include "vmx_assym.h" +#include "vmcs.h" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. + */ +/* BEGIN CSTYLED */ +#define VMX_GUEST_RESTORE \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VMX_GUEST_SAVE \ + movq %rdi, VMXSTK_TMPRDI(%rsp); \ + movq VMXSTK_RDI(%rsp), %rdi; \ + movq %rbp, VMXCTX_GUEST_RBP(%rdi); \ + leaq VMXSTK_FP(%rsp), %rbp; \ + movq %rsi, VMXCTX_GUEST_RSI(%rdi); \ + movq %rdx, VMXCTX_GUEST_RDX(%rdi); \ + movq %rcx, VMXCTX_GUEST_RCX(%rdi); \ + movq %r8, VMXCTX_GUEST_R8(%rdi); \ + movq %r9, VMXCTX_GUEST_R9(%rdi); \ + movq %rax, VMXCTX_GUEST_RAX(%rdi); \ + movq %rbx, VMXCTX_GUEST_RBX(%rdi); \ + movq %r10, VMXCTX_GUEST_R10(%rdi); \ + movq %r11, VMXCTX_GUEST_R11(%rdi); \ + movq %r12, VMXCTX_GUEST_R12(%rdi); \ + movq %r13, VMXCTX_GUEST_R13(%rdi); \ + movq %r14, VMXCTX_GUEST_R14(%rdi); \ + movq %r15, VMXCTX_GUEST_R15(%rdi); \ + movq %cr2, %rbx; \ + movq %rbx, VMXCTX_GUEST_CR2(%rdi); \ + movq VMXSTK_TMPRDI(%rsp), %rdx; \ + movq %rdx, VMXCTX_GUEST_RDI(%rdi); +/* END CSTYLED */ + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. 
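+ *
+ * The 32-bit xorl forms below are sufficient: writing a 32-bit register
+ * zero-extends into the full 64-bit register.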
+ */ +#define VMX_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + + +/* Stack layout (offset from %rsp) for vmx_enter_guest */ +#define VMXSTK_TMPRDI 0x00 /* temp store %rdi on vmexit */ +#define VMXSTK_R15 0x08 /* callee saved %r15 */ +#define VMXSTK_R14 0x10 /* callee saved %r14 */ +#define VMXSTK_R13 0x18 /* callee saved %r13 */ +#define VMXSTK_R12 0x20 /* callee saved %r12 */ +#define VMXSTK_RBX 0x28 /* callee saved %rbx */ +#define VMXSTK_RDX 0x30 /* save-args %rdx (int launched) */ +#define VMXSTK_RSI 0x38 /* save-args %rsi (struct vmx *vmx) */ +#define VMXSTK_RDI 0x40 /* save-args %rdi (struct vmxctx *ctx) */ +#define VMXSTK_FP 0x48 /* frame pointer %rbp */ +#define VMXSTKSIZE VMXSTK_FP + +/* + * vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched) + * Interrupts must be disabled on entry. + */ +ENTRY_NP(vmx_enter_guest) + pushq %rbp + movq %rsp, %rbp + subq $VMXSTKSIZE, %rsp + movq %r15, VMXSTK_R15(%rsp) + movq %r14, VMXSTK_R14(%rsp) + movq %r13, VMXSTK_R13(%rsp) + movq %r12, VMXSTK_R12(%rsp) + movq %rbx, VMXSTK_RBX(%rsp) + movq %rdx, VMXSTK_RDX(%rsp) + movq %rsi, VMXSTK_RSI(%rsp) + movq %rdi, VMXSTK_RDI(%rsp) + + movq %rdi, %r12 /* vmxctx */ + movq %rsi, %r13 /* vmx */ + movl %edx, %r14d /* launch state */ + + /* Write the current %rsp into the VMCS to be restored on vmexit */ + movl $VMCS_HOST_RSP, %eax + vmwrite %rsp, %rax + jbe vmwrite_error + + /* Check if vmresume is adequate or a full vmlaunch is required */ + cmpl $0, %r14d + je do_launch + + VMX_GUEST_RESTORE + vmresume + /* + * In the common case, 'vmresume' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMRESUME_ERROR + * to the caller. + */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error + +do_launch: + VMX_GUEST_RESTORE + vmlaunch + /* + * In the common case, 'vmlaunch' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMLAUNCH_ERROR + * to the caller. + */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +vmwrite_error: + movl $VMX_VMWRITE_ERROR, %eax + jmp decode_inst_error +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) + + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret + +/* + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx + */ +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. 
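+ * The callee-saved registers spilled in vmx_enter_guest() are restored
+ * from the stack frame before returning.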
+ */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_enter_guest) + + + +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest_flush_rsb) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + VMX_GUEST_FLUSH_SCRATCH + + /* + * To prevent malicious branch target predictions from affecting the + * host, overwrite all entries in the RSB upon exiting a guest. + */ + movl $16, %ecx /* 16 iterations, two calls per loop */ + movq %rsp, %rax +loop: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + subl $1, %ecx + jnz loop + movq %rax, %rsp + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_exit_guest_flush_rsb) + +/* + * %rdi = trapno + * + * We need to do enough to convince cmnint - and its iretting tail - that we're + * a legit interrupt stack frame. + */ +ENTRY_NP(vmx_call_isr) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + pushq $0 /* err */ + pushq %rdi /* trapno */ + cli + jmp cmnint /* %rip (and call) */ +.iret_dest: + popq %rbp + ret +SET_SIZE(vmx_call_isr) diff --git a/usr/src/uts/intel/io/vmm/intel/vtd.c b/usr/src/uts/intel/io/vmm/intel/vtd.c new file mode 100644 index 0000000000..d32143aa07 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vtd.c @@ -0,0 +1,877 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <dev/pci/pcireg.h> + +#include <machine/vmparam.h> +#include <sys/vmm_vm.h> + +#include <contrib/dev/acpica/include/acpi.h> + +#include <sys/sunndi.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +#define VTD_DRHD_INCLUDE_PCI_ALL(Flags) (((Flags) >> 0) & 0x1) + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + uint_t id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static ACPI_DMAR_HARDWARE_UNIT *drhds[DRHD_MAX_UNITS]; +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); +#ifndef __FreeBSD__ +static dev_info_t *vtddips[DRHD_MAX_UNITS]; +#endif + +static uint64_t root_table[PAGE_SIZE / sizeof (uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof (uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + 
} +} + +static uint_t +domain_id(void) +{ + uint_t id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static struct vtdmap * +vtd_device_scope(uint16_t rid) +{ + int i, remaining, pathrem; + char *end, *pathend; + struct vtdmap *vtdmap; + ACPI_DMAR_HARDWARE_UNIT *drhd; + ACPI_DMAR_DEVICE_SCOPE *device_scope; + ACPI_DMAR_PCI_PATH *path; + + for (i = 0; i < drhd_num; i++) { + drhd = drhds[i]; + + if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) { + /* + * From Intel VT-d arch spec, version 3.0: + * If a DRHD structure with INCLUDE_PCI_ALL flag Set is + * reported for a Segment, it must be enumerated by BIOS + * after all other DRHD structures for the same Segment. + */ + vtdmap = vtdmaps[i]; + return (vtdmap); + } + + end = (char *)drhd + drhd->Header.Length; + remaining = drhd->Header.Length - + sizeof (ACPI_DMAR_HARDWARE_UNIT); + while (remaining > sizeof (ACPI_DMAR_DEVICE_SCOPE)) { + device_scope = + (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining); + remaining -= device_scope->Length; + + switch (device_scope->EntryType) { + /* 0x01 and 0x02 are PCI device entries */ + case 0x01: + case 0x02: + break; + default: + continue; + } + + if (PCI_RID2BUS(rid) != device_scope->Bus) + continue; + + pathend = (char *)device_scope + device_scope->Length; + pathrem = device_scope->Length - + sizeof (ACPI_DMAR_DEVICE_SCOPE); + while (pathrem >= sizeof (ACPI_DMAR_PCI_PATH)) { + path = (ACPI_DMAR_PCI_PATH *) + (pathend - pathrem); + pathrem -= sizeof (ACPI_DMAR_PCI_PATH); + + if (PCI_RID2SLOT(rid) != path->Device) + continue; + if (PCI_RID2FUNC(rid) != path->Function) + continue; + + vtdmap = vtdmaps[i]; + return (vtdmap); + } + } + } + + /* No matching scope */ + return (NULL); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + invalidate_cache_all(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static void * +vtd_map(dev_info_t *dip) +{ + caddr_t regs; + ddi_acc_handle_t hdl; + int error; + + static ddi_device_acc_attr_t regs_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC, + }; + + error = ddi_regs_map_setup(dip, 0, ®s, 0, PAGE_SIZE, ®s_attr, + &hdl); + + if (error != DDI_SUCCESS) + return (NULL); + + ddi_set_driver_private(dip, hdl); + + return (regs); +} + +static void 
+vtd_unmap(dev_info_t *dip) +{ + ddi_acc_handle_t hdl = ddi_get_driver_private(dip); + + if (hdl != NULL) + ddi_regs_map_free(&hdl); +} + +#ifndef __FreeBSD__ +/* + * This lives in vtd_sol.c for license reasons. + */ +extern dev_info_t *vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *, int); +#endif + +static int +vtd_init(void) +{ + int i, units, remaining, tmp; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end; +#ifdef __FreeBSD__ + char envname[32]; + unsigned long mapaddr; +#endif + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + +#ifdef __FreeBSD__ + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof (envname), "vtd.regmap.%d.addr", + units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; +#else + units = 0; +#endif + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof (ACPI_TABLE_DMAR); + while (remaining > sizeof (ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. + */ + if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + break; + + drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; + drhds[units] = drhd; +#ifdef __FreeBSD__ + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); +#else + vtddips[units] = vtd_get_dip(drhd, units); + vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]); + if (vtdmaps[units] == NULL) + goto fail; +#endif + if (++units >= DRHD_MAX_UNITS) + break; + remaining -= hdr->Length; + } + + if (units <= 0) + return (ENXIO); + +#ifdef __FreeBSD__ +skip_dmar: +#endif + drhd_num = units; + + max_domains = 64 * 1024; /* maximum valid value */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + /* take most compatible (minimum) value */ + if ((tmp = vtd_max_domains(vtdmap)) < max_domains) + max_domains = tmp; + } + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); + +#ifndef __FreeBSD__ +fail: + for (i = 0; i <= units; i++) + vtd_unmap(vtddips[i]); + return (ENXIO); +#endif +} + +static void +vtd_cleanup(void) +{ +#ifndef __FreeBSD__ + int i; + + KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty")); + + bzero(root_table, sizeof (root_table)); + + for (i = 0; i <= drhd_num; i++) { + vtdmaps[i] = NULL; + /* + * Unmap the vtd registers. 
Note that the devinfo nodes + * themselves aren't removed, they are considered system state + * and can be reused when the module is reloaded. + */ + if (vtddips[i] != NULL) + vtd_unmap(vtddips[i]); + } +#endif +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, uint16_t rid) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = VTD_RID2IDX(rid); + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %x is already owned by " + "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8)); + } + + if ((vtdmap = vtd_device_scope(rid)) == NULL) + panic("vtd_add_device: device %x is not in scope for " + "any DMA remapping unit", rid); + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, uint16_t rid) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + idx = VTD_RID2IDX(rid); + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. + * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %lx/%lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %lx/%lx beyond " + "domain maxaddr %lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. 
+ * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + + tmp = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + tmp &= VTD_CAP_SAGAW(vtdmap->cap); + } + + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d", + tmp, agaw); + } + + dom = malloc(sizeof (struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef __FreeBSD__ +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. 
+ * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + dom->spsmask &= VTD_CAP_SPS(vtdmap->cap); + } +#endif +#else + /* + * On illumos we decidedly do not remove memory mapped to a VM's domain + * from the host_domain, so we don't have to deal with page demotion and + * can just use large pages. + * + * Since VM memory is currently allocated as 4k pages and mapped into + * the VM domain page by page, the use of large pages is essentially + * limited to the host_domain. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +const struct iommu_ops iommu_ops_intel = { + .init = vtd_init, + .cleanup = vtd_cleanup, + .enable = vtd_enable, + .disable = vtd_disable, + .create_domain = vtd_create_domain, + .destroy_domain = vtd_destroy_domain, + .create_mapping = vtd_create_mapping, + .remove_mapping = vtd_remove_mapping, + .add_device = vtd_add_device, + .remove_device = vtd_remove_device, + .invalidate_tlb = vtd_invalidate_tlb, +}; diff --git a/usr/src/uts/intel/io/vmm/intel/vtd_sol.c b/usr/src/uts/intel/io/vmm/intel/vtd_sol.c new file mode 100644 index 0000000000..26c6c5b024 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/intel/vtd_sol.c @@ -0,0 +1,84 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/sunndi.h> +#include <contrib/dev/acpica/include/acpi.h> + +dev_info_t * +vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *drhd, int unit) +{ + dev_info_t *dip; + struct ddi_parent_private_data *pdptr; + struct regspec reg; + int circ; + + /* + * Try to find an existing devinfo node for this vtd unit. + */ + ndi_devi_enter(ddi_root_node(), &circ); + dip = ddi_find_devinfo("vtd", unit, 0); + ndi_devi_exit(ddi_root_node(), circ); + + if (dip != NULL) + return (dip); + + /* + * None found, construct a devinfo node for this vtd unit. 
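+ * The constructed node is given a "reg" property and parent-private
+ * register data below so that vtd_map() can later access the unit's
+ * register block through ddi_regs_map_setup().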
+ */ + dip = ddi_add_child(ddi_root_node(), "vtd", + DEVI_SID_NODEID, unit); + + reg.regspec_bustype = 0; + reg.regspec_addr = drhd->Address; + reg.regspec_size = PAGE_SIZE; + + /* + * update the reg properties + * + * reg property will be used for register + * set access + * + * refer to the bus_map of root nexus driver + * I/O or memory mapping: + * + * <bustype=0, addr=x, len=x>: memory + * <bustype=1, addr=x, len=x>: i/o + * <bustype>1, addr=0, len=x>: x86-compatibility i/o + */ + (void) ndi_prop_update_int_array(DDI_DEV_T_NONE, + dip, "reg", (int *)®, + sizeof (struct regspec) / sizeof (int)); + + /* + * This is an artificially constructed dev_info, and we + * need to set a few more things to be able to use it + * for ddi_dma_alloc_handle/free_handle. + */ + ddi_set_driver(dip, ddi_get_driver(ddi_root_node())); + DEVI(dip)->devi_bus_dma_allochdl = + DEVI(ddi_get_driver((ddi_root_node()))); + + pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data) + + sizeof (struct regspec), KM_SLEEP); + pdptr->par_nreg = 1; + pdptr->par_reg = (struct regspec *)(pdptr + 1); + pdptr->par_reg->regspec_bustype = 0; + pdptr->par_reg->regspec_addr = drhd->Address; + pdptr->par_reg->regspec_size = PAGE_SIZE; + ddi_set_parent_data(dip, pdptr); + + return (dip); +} diff --git a/usr/src/uts/intel/io/vmm/io/iommu.c b/usr/src/uts/intel/io/vmm/io/iommu.c new file mode 100644 index 0000000000..3ebd394b99 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/iommu.c @@ -0,0 +1,349 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/cpu.h> +#include <machine/md_var.h> + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "vmm_util.h" +#include "iommu.h" + +static int iommu_avail; + +static int iommu_enable = 1; + +static const struct iommu_ops *ops; +static void *host_domain; +#ifdef __FreeBSD__ +static eventhandler_tag add_tag, delete_tag; +#endif + +#ifndef __FreeBSD__ +static volatile uint_t iommu_initted; +#endif + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, rid); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, rid); +} + +static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +#ifdef __FreeBSD__ +static void +iommu_pci_add(void *arg, device_t dev) +{ + + /* Add new devices to the host domain. 
*/ + iommu_add_device(host_domain, pci_get_rid(dev)); +} + +static void +iommu_pci_delete(void *arg, device_t dev) +{ + + iommu_remove_device(host_domain, pci_get_rid(dev)); +} +#endif + +#ifndef __FreeBSD__ +static int +iommu_find_device(dev_info_t *dip, void *arg) +{ + boolean_t add = (boolean_t)arg; + + if (pcie_is_pci_device(dip)) { + if (add) + iommu_add_device(host_domain, pci_get_rid(dip)); + else + iommu_remove_device(host_domain, pci_get_rid(dip)); + } + + return (DDI_WALK_CONTINUE); +} + +static vm_paddr_t +vmm_mem_maxaddr(void) +{ + return (ptoa(physmax + 1)); +} +#endif + +static void +iommu_init(void) +{ + int error; + vm_paddr_t maxaddr; + + if (!iommu_enable) + return; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_svm()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = 1; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = vmm_mem_maxaddr(); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) { + printf("iommu_init: unable to create a host domain"); + IOMMU_CLEANUP(); + ops = NULL; + iommu_avail = 0; + return; + } + + /* + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_TRUE); + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ +#ifdef __FreeBSD__ + if (add_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_add_device, add_tag); + add_tag = NULL; + } + if (delete_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag); + delete_tag = NULL; + } +#else + atomic_store_rel_int(&iommu_initted, 0); +#endif + IOMMU_DISABLE(); +#ifndef __FreeBSD__ + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_FALSE); +#endif + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +#ifndef __FreeBSD__ + ops = NULL; +#endif +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + if (iommu_initted < 2) { + if (atomic_cmpset_int(&iommu_initted, 0, 1)) { + iommu_init(); + atomic_store_rel_int(&iommu_initted, 2); + } else + while (iommu_initted == 1) + cpu_spinwait(); + } + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void +iommu_add_device(void *dom, uint16_t rid) +{ + + IOMMU_ADD_DEVICE(dom, rid); +} + +void +iommu_remove_device(void *dom, uint16_t rid) +{ + + IOMMU_REMOVE_DEVICE(dom, rid); +} + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/usr/src/uts/intel/io/vmm/io/iommu.h b/usr/src/uts/intel/io/vmm/io/iommu.h new file mode 100644 index 0000000000..a5e9448b4e --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/iommu.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern const struct iommu_ops iommu_ops_intel; +extern const struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/usr/src/uts/intel/io/vmm/io/ppt.c b/usr/src/uts/intel/io/vmm/io/ppt.c new file mode 100644 index 0000000000..e79842d8a8 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/ppt.c @@ -0,0 +1,1490 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/sysctl.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/pci.h> +#include <sys/pci_cap.h> +#include <sys/pcie_impl.h> +#include <sys/ppt_dev.h> +#include <sys/mkdev.h> +#include <sys/sysmacros.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_MSIMSGS 32 + +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. 
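+ * With PCIR_MAX_BAR_0 naming the last of the six type-0 BARs, this works
+ * out to seven MMIO segments per passed-through device.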
+ */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + +struct pptintr_arg { + struct pptdev *pptdev; + uint64_t addr; + uint64_t msg_data; +}; + +struct pptseg { + vm_paddr_t gpa; + size_t len; + int wired; +}; + +struct pptbar { + uint64_t base; + uint64_t size; + uint_t type; + ddi_acc_handle_t io_handle; + caddr_t io_ptr; + uint_t ddireg; +}; + +struct pptdev { + dev_info_t *pptd_dip; + list_node_t pptd_node; + ddi_acc_handle_t pptd_cfg; + struct pptbar pptd_bars[PCI_BASE_NUM]; + struct vm *vm; + struct pptseg mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + boolean_t is_fixed; + size_t inth_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; + + struct { + int num_msgs; + size_t inth_sz; + size_t arg_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg *arg; + } msix; +}; + + +static major_t ppt_major; +static void *ppt_state; +static kmutex_t pptdev_mtx; +static list_t pptdev_list; + +#define PPT_MINOR_NAME "ppt" + +static ddi_device_acc_attr_t ppt_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STORECACHING_OK_ACC, + DDI_DEFAULT_ACC +}; + +static int +ppt_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + /* XXX: require extra privs? */ + return (0); +} + +#define BAR_TO_IDX(bar) (((bar) - PCI_CONF_BASE0) / PCI_BAR_SZ_32) +#define BAR_VALID(b) ( \ + (b) >= PCI_CONF_BASE0 && \ + (b) <= PCI_CONF_BASE5 && \ + ((b) & (PCI_BAR_SZ_32-1)) == 0) + +static int +ppt_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + minor_t minor = getminor(dev); + struct pptdev *ppt; + void *data = (void *)arg; + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) { + return (ENOENT); + } + + switch (cmd) { + case PPT_CFG_READ: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + cio.pci_data = pci_config_get32(cfg, cio.pci_off); + break; + case 2: + cio.pci_data = pci_config_get16(cfg, cio.pci_off); + break; + case 1: + cio.pci_data = pci_config_get8(cfg, cio.pci_off); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&cio, data, sizeof (cio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_CFG_WRITE: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + pci_config_put32(cfg, cio.pci_off, cio.pci_data); + break; + case 2: + pci_config_put16(cfg, cio.pci_off, cio.pci_data); + break; + case 1: + pci_config_put8(cfg, cio.pci_off, cio.pci_data); + break; + default: + return (EINVAL); + } + + return (0); + } + case PPT_BAR_QUERY: { + struct ppt_bar_query barg; + struct pptbar *pbar; + + if (ddi_copyin(data, &barg, sizeof (barg), md) != 0) { + return (EFAULT); + } + if (barg.pbq_baridx >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[barg.pbq_baridx]; + + if (pbar->base == 0 || pbar->size == 0) { + return (ENOENT); + } + barg.pbq_type = pbar->type; + barg.pbq_base = pbar->base; + barg.pbq_size = pbar->size; + + if (ddi_copyout(&barg, data, sizeof (barg), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_READ: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if 
(pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + bio.pbi_data = ddi_get32(pbar->io_handle, addr); + break; + case 2: + bio.pbi_data = ddi_get16(pbar->io_handle, addr); + break; + case 1: + bio.pbi_data = ddi_get8(pbar->io_handle, addr); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&bio, data, sizeof (bio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_WRITE: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + ddi_put32(pbar->io_handle, addr, bio.pbi_data); + break; + case 2: + ddi_put16(pbar->io_handle, addr, bio.pbi_data); + break; + case 1: + ddi_put8(pbar->io_handle, addr, bio.pbi_data); + break; + default: + return (EINVAL); + } + + return (0); + } + + default: + return (ENOTTY); + } + + return (0); +} + +static int +ppt_find_msix_table_bar(struct pptdev *ppt) +{ + uint16_t base; + uint32_t off; + + if (PCI_CAP_LOCATE(ppt->pptd_cfg, PCI_CAP_ID_MSI_X, &base) != + DDI_SUCCESS) + return (-1); + + off = pci_config_get32(ppt->pptd_cfg, base + PCI_MSIX_TBL_OFFSET); + + if (off == PCI_EINVAL32) + return (-1); + + return (off & PCI_MSIX_TBL_BIR_MASK); +} + +static int +ppt_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len, + size_t *maplen, uint_t model) +{ + minor_t minor; + struct pptdev *ppt; + int err, bar; + uint_t ddireg; + + minor = getminor(dev); + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) + return (ENXIO); + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(model) != DDI_MODEL_NONE) + return (ENXIO); +#endif + + if (off < 0 || off != P2ALIGN(off, PAGESIZE)) + return (EINVAL); + + if ((bar = ppt_find_msix_table_bar(ppt)) == -1) + return (EINVAL); + + ddireg = ppt->pptd_bars[bar].ddireg; + + if (ddireg == 0) + return (EINVAL); + + err = devmap_devmem_setup(dhp, ppt->pptd_dip, NULL, ddireg, off, len, + PROT_USER | PROT_READ | PROT_WRITE, IOMEM_DATA_CACHED, &ppt_attr); + + if (err == DDI_SUCCESS) + *maplen = len; + + return (err); +} + +static void +ppt_bar_wipe(struct pptdev *ppt) +{ + uint_t i; + + for (i = 0; i < PCI_BASE_NUM; i++) { + struct pptbar *pbar = &ppt->pptd_bars[i]; + if (pbar->type == PCI_ADDR_IO && pbar->io_handle != NULL) { + ddi_regs_map_free(&pbar->io_handle); + } + } + bzero(&ppt->pptd_bars, sizeof (ppt->pptd_bars)); +} + +static int +ppt_bar_crawl(struct pptdev *ppt) +{ + pci_regspec_t *regs; + uint_t rcount, i; + int err = 0, rlen; + + if (ddi_getlongprop(DDI_DEV_T_ANY, ppt->pptd_dip, DDI_PROP_DONTPASS, + "assigned-addresses", (caddr_t)®s, &rlen) != DDI_PROP_SUCCESS) { + return (EIO); + } + + VERIFY3S(rlen, >, 0); + rcount = rlen / sizeof (pci_regspec_t); + for (i = 0; i < rcount; i++) { + pci_regspec_t *reg = ®s[i]; + struct pptbar *pbar; + uint_t bar, rnum; + + DTRACE_PROBE1(ppt__crawl__reg, pci_regspec_t *, reg); + bar = PCI_REG_REG_G(reg->pci_phys_hi); + if (!BAR_VALID(bar)) { + continue; + } + + rnum = BAR_TO_IDX(bar); + pbar = &ppt->pptd_bars[rnum]; + /* is this somehow already populated? 
*/ + if (pbar->base != 0 || pbar->size != 0) { + err = EEXIST; + break; + } + + /* + * Register 0 corresponds to the PCI config space. + * The registers which match the assigned-addresses list are + * offset by 1. + */ + pbar->ddireg = i + 1; + + pbar->type = reg->pci_phys_hi & PCI_ADDR_MASK; + pbar->base = ((uint64_t)reg->pci_phys_mid << 32) | + (uint64_t)reg->pci_phys_low; + pbar->size = ((uint64_t)reg->pci_size_hi << 32) | + (uint64_t)reg->pci_size_low; + if (pbar->type == PCI_ADDR_IO) { + err = ddi_regs_map_setup(ppt->pptd_dip, rnum, + &pbar->io_ptr, 0, 0, &ppt_attr, &pbar->io_handle); + if (err != 0) { + break; + } + } + } + kmem_free(regs, rlen); + + if (err != 0) { + ppt_bar_wipe(ppt); + } + return (err); +} + +static boolean_t +ppt_bar_verify_mmio(struct pptdev *ppt, uint64_t base, uint64_t size) +{ + const uint64_t map_end = base + size; + + /* Zero-length or overflow mappings are not valid */ + if (map_end <= base) { + return (B_FALSE); + } + /* MMIO bounds should be page-aligned */ + if ((base & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (B_FALSE); + } + + for (uint_t i = 0; i < PCI_BASE_NUM; i++) { + const struct pptbar *bar = &ppt->pptd_bars[i]; + const uint64_t bar_end = bar->base + bar->size; + + /* Only memory BARs can be mapped */ + if (bar->type != PCI_ADDR_MEM32 && + bar->type != PCI_ADDR_MEM64) { + continue; + } + + /* Does the mapping fit within this BAR? */ + if (base < bar->base || base >= bar_end || + map_end < bar->base || map_end > bar_end) { + continue; + } + + /* This BAR satisfies the provided map */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +ppt_ddi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + struct pptdev *ppt = NULL; + char name[PPT_MAXNAMELEN]; + int inst; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + inst = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(ppt_state, inst) != DDI_SUCCESS) { + goto fail; + } + VERIFY(ppt = ddi_get_soft_state(ppt_state, inst)); + ppt->pptd_dip = dip; + ddi_set_driver_private(dip, ppt); + + if (pci_config_setup(dip, &ppt->pptd_cfg) != DDI_SUCCESS) { + goto fail; + } + if (ppt_bar_crawl(ppt) != 0) { + goto fail; + } + if (ddi_create_minor_node(dip, PPT_MINOR_NAME, S_IFCHR, inst, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + mutex_enter(&pptdev_mtx); + list_insert_tail(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + return (DDI_SUCCESS); + +fail: + if (ppt != NULL) { + ddi_remove_minor_node(dip, NULL); + if (ppt->pptd_cfg != NULL) { + pci_config_teardown(&ppt->pptd_cfg); + } + ppt_bar_wipe(ppt); + ddi_soft_state_free(ppt_state, inst); + } + return (DDI_FAILURE); +} + +static int +ppt_ddi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + struct pptdev *ppt; + int inst; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ppt = ddi_get_driver_private(dip); + inst = ddi_get_instance(dip); + + ASSERT3P(ddi_get_soft_state(ppt_state, inst), ==, ppt); + + mutex_enter(&pptdev_mtx); + if (ppt->vm != NULL) { + mutex_exit(&pptdev_mtx); + return (DDI_FAILURE); + } + list_remove(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + ddi_remove_minor_node(dip, PPT_MINOR_NAME); + ppt_bar_wipe(ppt); + pci_config_teardown(&ppt->pptd_cfg); + ddi_set_driver_private(dip, NULL); + ddi_soft_state_free(ppt_state, inst); + + return (DDI_SUCCESS); +} + +static int +ppt_ddi_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + int inst = getminor((dev_t)arg); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: { + struct pptdev *ppt = 
ddi_get_soft_state(ppt_state, inst); + + if (ppt != NULL) { + *result = (void *)ppt->pptd_dip; + error = DDI_SUCCESS; + } + break; + } + case DDI_INFO_DEVT2INSTANCE: { + *result = (void *)(uintptr_t)inst; + error = DDI_SUCCESS; + break; + } + default: + break; + } + return (error); +} + +static struct cb_ops ppt_cb_ops = { + ppt_open, + nulldev, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + ppt_ioctl, + ppt_devmap, /* devmap */ + NULL, /* mmap */ + NULL, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_64BIT | D_DEVMAP, + CB_REV +}; + +static struct dev_ops ppt_ops = { + DEVO_REV, + 0, + ppt_ddi_info, + nulldev, /* identify */ + nulldev, /* probe */ + ppt_ddi_attach, + ppt_ddi_detach, + nodev, /* reset */ + &ppt_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "bhyve pci pass-thru", + &ppt_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + mutex_init(&pptdev_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&pptdev_list, sizeof (struct pptdev), + offsetof(struct pptdev, pptd_node)); + + error = ddi_soft_state_init(&ppt_state, sizeof (struct pptdev), 0); + if (error) { + goto fail; + } + + error = mod_install(&modlinkage); + + ppt_major = ddi_name_to_major("ppt"); +fail: + if (error) { + ddi_soft_state_fini(&ppt_state); + } + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + ddi_soft_state_fini(&ppt_state); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static boolean_t +ppt_wait_for_pending_txn(dev_info_t *dip, uint_t max_delay_us) +{ + uint16_t cap_ptr, devsts; + ddi_acc_handle_t hdl; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + while ((devsts & PCIE_DEVSTS_TRANS_PENDING) != 0) { + if (max_delay_us == 0) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + /* Poll once every 100 milliseconds up to the timeout. 
*/ + if (max_delay_us > 100000) { + delay(drv_usectohz(100000)); + max_delay_us -= 100000; + } else { + delay(drv_usectohz(max_delay_us)); + max_delay_us = 0; + } + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + } + + pci_config_teardown(&hdl); + return (B_TRUE); +} + +static uint_t +ppt_max_completion_tmo_us(dev_info_t *dip) +{ + uint_t timo = 0; + uint16_t cap_ptr; + ddi_acc_handle_t hdl; + uint_t timo_ranges[] = { /* timeout ranges */ + 50000, /* 50ms */ + 100, /* 100us */ + 10000, /* 10ms */ + 0, + 0, + 55000, /* 55ms */ + 210000, /* 210ms */ + 0, + 0, + 900000, /* 900ms */ + 3500000, /* 3.5s */ + 0, + 0, + 13000000, /* 13s */ + 64000000, /* 64s */ + 0 + }; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (50000); /* default 50ms */ + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto out; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_PCIECAP) & + PCIE_PCIECAP_VER_MASK) < PCIE_PCIECAP_VER_2_0) + goto out; + + if ((PCI_CAP_GET32(hdl, 0, cap_ptr, PCIE_DEVCAP2) & + PCIE_DEVCTL2_COM_TO_RANGE_MASK) == 0) + goto out; + + timo = timo_ranges[PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL2) & + PCIE_DEVCAP2_COM_TO_RANGE_MASK]; + +out: + if (timo == 0) + timo = 50000; /* default 50ms */ + + pci_config_teardown(&hdl); + return (timo); +} + +static boolean_t +ppt_flr(dev_info_t *dip, boolean_t force) +{ + uint16_t cap_ptr, ctl, cmd; + ddi_acc_handle_t hdl; + uint_t compl_delay = 0, max_delay_us; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto fail; + + if ((PCI_CAP_GET32(hdl, 0, cap_ptr, PCIE_DEVCAP) & PCIE_DEVCAP_FLR) + == 0) + goto fail; + + max_delay_us = MAX(ppt_max_completion_tmo_us(dip), 10000); + + /* + * Disable busmastering to prevent generation of new transactions while + * waiting for the device to go idle. If the idle timeout fails, the + * command register is restored which will re-enable busmastering. + */ + cmd = pci_config_get16(hdl, PCI_CONF_COMM); + pci_config_put16(hdl, PCI_CONF_COMM, cmd & ~PCI_COMM_ME); + if (!ppt_wait_for_pending_txn(dip, max_delay_us)) { + if (!force) { + pci_config_put16(hdl, PCI_CONF_COMM, cmd); + goto fail; + } + dev_err(dip, CE_WARN, + "?Resetting with transactions pending after %u us\n", + max_delay_us); + + /* + * Extend the post-FLR delay to cover the maximum Completion + * Timeout delay of anything in flight during the FLR delay. + * Enforce a minimum delay of at least 10ms. + */ + compl_delay = MAX(10, (ppt_max_completion_tmo_us(dip) / 1000)); + } + + /* Initiate the reset. */ + ctl = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL); + (void) PCI_CAP_PUT16(hdl, 0, cap_ptr, PCIE_DEVCTL, + ctl | PCIE_DEVCTL_INITIATE_FLR); + + /* Wait for at least 100ms */ + delay(drv_usectohz((100 + compl_delay) * 1000)); + + pci_config_teardown(&hdl); + return (B_TRUE); + +fail: + /* + * TODO: If the FLR fails for some reason, we should attempt a reset + * using the PCI power management facilities (if possible). 
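+ * A D3hot -> D0 power state cycle performs an internal reset on functions
+ * that do not advertise No_Soft_Reset in their PMCSR.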
+ */ + pci_config_teardown(&hdl); + return (B_FALSE); +} + +static int +ppt_findf(struct vm *vm, int fd, struct pptdev **pptp) +{ + struct pptdev *ppt = NULL; + file_t *fp; + vattr_t va; + int err = 0; + + ASSERT(MUTEX_HELD(&pptdev_mtx)); + + if ((fp = getf(fd)) == NULL) + return (EBADF); + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 || + getmajor(va.va_rdev) != ppt_major) { + err = EBADF; + goto fail; + } + + ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev)); + + if (ppt == NULL) { + err = EBADF; + goto fail; + } + + if (ppt->vm != vm) { + err = EBUSY; + goto fail; + } + + *pptp = ppt; + return (0); + +fail: + releasef(fd); + return (err); +} + +static void +ppt_unmap_all_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct pptseg *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void) vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof (struct pptseg)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msi.inth[i], 1); + else + ddi_intr_disable(ppt->msi.inth[i]); + + ddi_intr_remove_handler(ppt->msi.inth[i]); + ddi_intr_free(ppt->msi.inth[i]); + + ppt->msi.inth[i] = NULL; + } + + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + ppt->msi.inth = NULL; + ppt->msi.inth_sz = 0; + ppt->msi.is_fixed = B_FALSE; + + ppt->msi.num_msgs = 0; +} + +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + if (ppt->msix.inth != NULL && ppt->msix.inth[idx] != NULL) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msix.inth[idx], 1); + else + ddi_intr_disable(ppt->msix.inth[idx]); + + ddi_intr_remove_handler(ppt->msix.inth[idx]); + } +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + uint_t i; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.inth) { + for (i = 0; i < ppt->msix.num_msgs; i++) + ddi_intr_free(ppt->msix.inth[i]); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.inth = NULL; + ppt->msix.inth_sz = 0; + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + ppt->msix.arg = NULL; + ppt->msix.arg_sz = 0; + } + + ppt->msix.num_msgs = 0; +} + +int +ppt_assigned_devices(struct vm *vm) +{ + struct pptdev *ppt; + uint_t num = 0; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + num++; + } + } + mutex_exit(&pptdev_mtx); + return (num); +} + +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + struct pptdev *ppt = list_head(&pptdev_list); + + /* XXX: this should probably be restructured to avoid the lock */ + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm != vm) { + continue; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) { + mutex_exit(&pptdev_mtx); + return (B_TRUE); + } + } + } + + mutex_exit(&pptdev_mtx); + return (B_FALSE); +} + +int +ppt_assign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; 
+ int err = 0; + + mutex_enter(&pptdev_mtx); + /* Passing NULL requires the device to be unowned. */ + err = ppt_findf(NULL, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore the device state after reset and then perform another save + * so the "pristine" state can be restored when the device is removed + * from the guest. + */ + if (pci_restore_config_regs(ppt->pptd_dip) != DDI_SUCCESS || + pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + + ppt->vm = vm; + iommu_remove_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + pf_set_passthru(ppt->pptd_dip, B_TRUE); + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static void +ppt_reset_pci_power_state(dev_info_t *dip) +{ + ddi_acc_handle_t cfg; + uint16_t cap_ptr; + + if (pci_config_setup(dip, &cfg) != DDI_SUCCESS) + return; + + if (PCI_CAP_LOCATE(cfg, PCI_CAP_ID_PM, &cap_ptr) == DDI_SUCCESS) { + uint16_t val; + + val = PCI_CAP_GET16(cfg, 0, cap_ptr, PCI_PMCSR); + if ((val & PCI_PMCSR_STATE_MASK) != PCI_PMCSR_D0) { + val = (val & ~PCI_PMCSR_STATE_MASK) | PCI_PMCSR_D0; + (void) PCI_CAP_PUT16(cfg, 0, cap_ptr, PCI_PMCSR, + val); + } + } + + pci_config_teardown(&cfg); +} + +static void +ppt_do_unassign(struct pptdev *ppt) +{ + struct vm *vm = ppt->vm; + + ASSERT3P(vm, !=, NULL); + ASSERT(MUTEX_HELD(&pptdev_mtx)); + + + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore from the state saved during device assignment. + * If the device power state has been altered, that must be remedied + * first, as it will reset register state during the transition. + */ + ppt_reset_pci_power_state(ppt->pptd_dip); + (void) pci_restore_config_regs(ppt->pptd_dip); + + pf_set_passthru(ppt->pptd_dip, B_FALSE); + + ppt_unmap_all_mmio(vm, ppt); + ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); + iommu_remove_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + ppt->vm = NULL; +} + +int +ppt_unassign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + ppt_do_unassign(ppt); + + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_unassign_all(struct vm *vm) +{ + struct pptdev *ppt; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + ppt_do_unassign(ppt); + } + } + mutex_exit(&pptdev_mtx); + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + /* + * Ensure that the host-physical range of the requested mapping fits + * within one of the MMIO BARs of the device. 
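+ * ppt_bar_verify_mmio() also rejects zero-length or overflowing ranges
+ * and requires both the base and the length to be page-aligned.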
+ */ + if (!ppt_bar_verify_mmio(ppt, hpa, len)) { + err = EINVAL; + goto done; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) { + err = vm_map_mmio(vm, gpa, len, hpa); + if (err == 0) { + seg->gpa = gpa; + seg->len = len; + } + goto done; + } + } + err = ENOSPC; + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_unmap_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len) +{ + struct pptdev *ppt; + int err = 0; + uint_t i; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + for (i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->gpa == gpa && seg->len == len) { + err = vm_unmap_mmio(vm, seg->gpa, seg->len); + if (err == 0) { + seg->gpa = 0; + seg->len = 0; + } + goto out; + } + } + err = ENOENT; +out: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static uint_t +pptintr(caddr_t arg, caddr_t unused) +{ + struct pptintr_arg *pptarg = (struct pptintr_arg *)arg; + struct pptdev *ppt = pptarg->pptdev; + + if (ppt->vm != NULL) { + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); + } else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + return (ppt->msi.is_fixed ? DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg, + int numvec) +{ + int i, msi_count, intr_type; + struct pptdev *ppt; + int err = 0; + + if (numvec < 0 || numvec > MAX_MSIMSGS) + return (EINVAL); + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + /* Reject attempts to enable MSI while MSI-X is active. */ + if (ppt->msix.num_msgs != 0 && numvec != 0) { + err = EBUSY; + goto done; + } + + /* Free any allocated resources */ + ppt_teardown_msi(ppt); + + if (numvec == 0) { + /* nothing more to do */ + goto done; + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + &msi_count) != DDI_SUCCESS) { + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_FIXED, + &msi_count) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + intr_type = DDI_INTR_TYPE_FIXED; + ppt->msi.is_fixed = B_TRUE; + } else { + intr_type = DDI_INTR_TYPE_MSI; + } + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) { + err = EINVAL; + goto done; + } + + ppt->msi.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msi.inth = kmem_zalloc(ppt->msi.inth_sz, KM_SLEEP); + if (ddi_intr_alloc(ppt->pptd_dip, ppt->msi.inth, intr_type, 0, + numvec, &msi_count, 0) != DDI_SUCCESS) { + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + err = EINVAL; + goto done; + } + + /* Verify that we got as many vectors as the guest requested */ + if (numvec != msi_count) { + ppt_teardown_msi(ppt); + err = EINVAL; + goto done; + } + + /* Set up & enable interrupt handler for each vector. 
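+	 * Vector i is delivered with MSI address 'addr' and data 'msg + i',
+	 * matching the consecutive data values a guest programs for a
+	 * multi-message MSI device.  If any handler fails to attach or
+	 * enable, the partial setup is torn down and ENXIO is returned.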
*/ + for (i = 0; i < numvec; i++) { + int res, intr_cap = 0; + + ppt->msi.num_msgs = i + 1; + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; + + if (ddi_intr_add_handler(ppt->msi.inth[i], pptintr, + &ppt->msi.arg[i], NULL) != DDI_SUCCESS) + break; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msi.inth[i], 1); + else + res = ddi_intr_enable(ppt->msi.inth[i]); + + if (res != DDI_SUCCESS) + break; + } + if (i < numvec) { + ppt_teardown_msi(ppt); + err = ENXIO; + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control) +{ + struct pptdev *ppt; + int numvec, alloced; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + /* Reject attempts to enable MSI-X while MSI is active. */ + if (ppt->msi.num_msgs != 0) { + err = EBUSY; + goto done; + } + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (ppt->msix.num_msgs == 0) { + dev_info_t *dip = ppt->pptd_dip; + + if (ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, + &numvec) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.arg_sz = numvec * sizeof (ppt->msix.arg[0]); + ppt->msix.arg = kmem_zalloc(ppt->msix.arg_sz, KM_SLEEP); + ppt->msix.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msix.inth = kmem_zalloc(ppt->msix.inth_sz, KM_SLEEP); + + if (ddi_intr_alloc(dip, ppt->msix.inth, DDI_INTR_TYPE_MSIX, 0, + numvec, &alloced, 0) != DDI_SUCCESS) { + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.arg = NULL; + ppt->msix.inth = NULL; + ppt->msix.arg_sz = ppt->msix.inth_sz = 0; + err = EINVAL; + goto done; + } + + if (numvec != alloced) { + ppt_teardown_msix(ppt); + err = EINVAL; + goto done; + } + } + + if (idx >= ppt->msix.num_msgs) { + err = EINVAL; + goto done; + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + int intr_cap, res; + + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; + + /* Setup the MSI-X interrupt */ + if (ddi_intr_add_handler(ppt->msix.inth[idx], pptintr, + &ppt->msix.arg[idx], NULL) != DDI_SUCCESS) { + err = ENXIO; + goto done; + } + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msix.inth[idx], 1); + else + res = ddi_intr_enable(ppt->msix.inth[idx]); + + if (res != DDI_SUCCESS) { + ddi_intr_remove_handler(ppt->msix.inth[idx]); + err = ENXIO; + goto done; + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + msilimit) != DDI_SUCCESS) { + *msilimit = -1; + } + if 
(ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSIX, + msixlimit) != DDI_SUCCESS) { + *msixlimit = -1; + } + + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_disable_msix(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + ppt_teardown_msix(ppt); + + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} diff --git a/usr/src/uts/intel/io/vmm/io/ppt.conf b/usr/src/uts/intel/io/vmm/io/ppt.conf new file mode 100644 index 0000000000..0485580bb8 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/ppt.conf @@ -0,0 +1,15 @@ +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE + +# +# Copyright 2017 Joyent, Inc. +# + diff --git a/usr/src/uts/intel/io/vmm/io/ppt.h b/usr/src/uts/intel/io/vmm/io/ppt.h new file mode 100644 index 0000000000..f69a352fe0 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/ppt.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +int ppt_unmap_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len); +int ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, + uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control); +int ppt_disable_msix(struct vm *vm, int pptfd); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); +int ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int pptfd); +int ppt_unassign_device(struct vm *vm, int pptfd); +#endif diff --git a/usr/src/uts/intel/io/vmm/io/ppt.mapfile b/usr/src/uts/intel/io/vmm/io/ppt.mapfile new file mode 100644 index 0000000000..42d92f0066 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/ppt.mapfile @@ -0,0 +1,56 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE + +# +# Copyright 2019 Joyent, Inc. +# Copyright 2021 OmniOS Community Edition (OmniOSce) Association. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # PCI pass-thru API for bhyve + ppt_assigned_devices; + ppt_is_mmio; + ppt_assign_device; + ppt_disable_msix; + ppt_unassign_device; + ppt_unassign_all; + ppt_map_mmio; + ppt_unmap_mmio; + ppt_setup_msi; + ppt_setup_msix; + ppt_get_limits; + + local: + *; +}; diff --git a/usr/src/uts/intel/io/vmm/io/vatpic.c b/usr/src/uts/intel/io/vmm/io/vatpic.c new file mode 100644 index 0000000000..aa3ae4186d --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vatpic.c @@ -0,0 +1,825 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <x86/apicreg.h> +#include <dev/ic/i8259.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mutex_enter(&((vatpic)->lock)) +#define VATPIC_UNLOCK(vatpic) mutex_exit(&((vatpic)->lock)) +#define VATPIC_LOCKED(vatpic) MUTEX_HELD(&((vatpic)->lock)) + +#define IRQ_BASE_MASK 0xf8 + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +enum icw_state { + IS_ICW1 = 0, + IS_ICW2, + IS_ICW3, + IS_ICW4, +}; + +struct atpic { + enum icw_state icw_state; + + bool ready; + bool auto_eoi; + bool poll; + bool rotate; + bool special_full_nested; + bool read_isr_next; + bool intr_raised; + bool special_mask_mode; + + uint8_t reg_irr; /* Interrupt Request Register (IIR) */ + uint8_t reg_isr; /* Interrupt Service (ISR) */ + uint8_t reg_imr; /* Interrupt Mask Register (IMR) */ + uint8_t irq_base; /* base interrupt vector */ + uint8_t lowprio; /* lowest priority irq */ + uint8_t elc; /* level-triggered mode bits */ + + uint_t acnt[8]; /* sum of pin asserts and deasserts */ +}; + +struct atpic_stats { + uint64_t as_interrupts; + uint64_t as_saturate_low; + uint64_t as_saturate_high; +}; + +struct vatpic { + struct vm *vm; + kmutex_t lock; + struct atpic atpic[2]; + struct atpic_stats stats; +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest. 
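+ *
+ * The pin immediately after 'lowprio' has the highest priority.  For
+ * example, with atpic->lowprio == 5 the pins are visited in the order
+ * 6, 7, 0, 1, 2, 3, 4, 5.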
+ */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static int vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->reg_isr & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->special_mask_mode && + (atpic->reg_imr & bit) != 0) { + continue; + } else { + return (pin); + } + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->reg_isr; + if (atpic->special_full_nested) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->special_mask_mode) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->reg_irr & bit) != 0 && (atpic->reg_imr & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + ASSERT(VATPIC_LOCKED(vatpic)); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->reg_imr, atpic->reg_irr, atpic->reg_isr); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + if (vatpic_set_pinstate(vatpic, 2, true) == 0) { + (void) vatpic_set_pinstate(vatpic, 2, false); + } + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->reg_imr, atpic->reg_irr, atpic->reg_isr); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->reg_imr, atpic->reg_irr, atpic->reg_isr); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. 
Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. + */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + vatpic->stats.as_interrupts++; + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->reg_imr, atpic->reg_irr, atpic->reg_isr); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + atpic->ready = false; + + atpic->icw_state = IS_ICW1; + atpic->reg_irr = 0; + atpic->reg_imr = 0; + atpic->lowprio = 7; + atpic->read_isr_next = false; + atpic->poll = false; + atpic->special_mask_mode = false; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_state = IS_ICW2; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + atpic->irq_base = val & IRQ_BASE_MASK; + atpic->icw_state = IS_ICW3; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + atpic->icw_state = IS_ICW4; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + atpic->auto_eoi = (val & ICW4_AEOI) != 0; + if (master_atpic(vatpic, atpic)) { + atpic->special_full_nested = (val & ICW4_SFNM) != 0; + } + + atpic->icw_state = IS_ICW1; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + atpic->reg_imr = val; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + atpic->rotate = (val & OCW2_R) != 0; + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->reg_isr &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + if ((val & OCW3_ESMM) != 0) { + atpic->special_mask_mode = (val & OCW3_SMM) != 0; + } + if ((val & OCW3_RR) != 0) { + atpic->read_isr_next = (val & OCW3_RIS) != 0; + } + if ((val & OCW3_P) != 0) { + atpic->poll = true; + } + + return (0); +} + +static int +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + uint_t oldcnt, newcnt; + int err = 0; + + VERIFY(pin >= 0 && pin < 16); + ASSERT(VATPIC_LOCKED(vatpic)); + + const int lpin = pin & 0x7; + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = newcnt = atpic->acnt[lpin]; + if (newstate) { + if (newcnt != UINT_MAX) { + newcnt++; + } else { + err = E2BIG; + 
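+			/*
+			 * Saturate rather than wrap: the per-pin assert
+			 * count is left at UINT_MAX and the overflow is
+			 * reported via the DTrace probe and the
+			 * as_saturate_high statistic.
+			 */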
DTRACE_PROBE2(vatpic__sat_high, struct vatpic *, vatpic, + int, pin); + vatpic->stats.as_saturate_high++; + } + } else { + if (newcnt != 0) { + newcnt--; + } else { + err = ERANGE; + DTRACE_PROBE2(vatpic__sat_low, struct vatpic *, vatpic, + int, pin); + vatpic->stats.as_saturate_low++; + } + } + atpic->acnt[lpin] = newcnt; + + const bool level = ((atpic->elc & (1 << (lpin))) != 0); + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + DTRACE_PROBE2(vatpic__assert, struct vatpic *, vatpic, + int, pin); + atpic->reg_irr |= (1 << lpin); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + DTRACE_PROBE2(vatpic__deassert, struct vatpic *, vatpic, + int, pin); + if (level) { + atpic->reg_irr &= ~(1 << lpin); + } + } + + vatpic_notify_intr(vatpic); + return (err); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int err = 0; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (!atpic->ready) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + err = vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + err = vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + err = vatpic_set_pinstate(vatpic, irq, true); + if (err == 0) { + err = vatpic_set_pinstate(vatpic, irq, false); + } + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (err); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comments in vatpic_elc_handler. + * These IRQs must be edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + struct vatpic *vatpic = vm_atpic(vm); + struct atpic *atpic = &vatpic->atpic[irq >> 3]; + const int pin = irq & 0x7; + + VATPIC_LOCK(vatpic); + if (trigger == LEVEL_TRIGGER) { + atpic->elc |= (1 << pin); + } else { + atpic->elc &= ~(1 << pin); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. 
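+	 * The 8259 signals a spurious interrupt on IRQ7 (IRQ15 on the slave),
+	 * i.e. pin 7 of the PIC in question, so the vector returned below is
+	 * irq_base + 7.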
+ */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + ASSERT(pin >= 0 && pin < 8); + + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->reg_irr &= ~(1 << pin); + + if (atpic->auto_eoi) { + if (atpic->rotate) + atpic->lowprio = pin; + } else { + atpic->reg_isr |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & IRQ_BASE_MASK) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. + */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { + atpic->poll = false; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrrupt mask register */ + *eax = atpic->reg_imr; + } else { + if (atpic->read_isr_next) { + /* read interrupt service register */ + *eax = atpic->reg_isr; + } else { + /* read interrupt request register */ + *eax = atpic->reg_irr; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_state) { + case IS_ICW2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case IS_ICW3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case IS_ICW4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax) +{ + struct vatpic *vatpic = arg; + struct atpic *atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax) +{ + struct vatpic *vatpic = arg; + struct atpic *atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax) +{ + struct vatpic *vatpic = arg; + struct atpic *atpic = NULL; + uint8_t elc_mask = 0; + + switch (port) { + case IO_ELCR1: + atpic = &vatpic->atpic[0]; + 
/* + * For the master PIC the cascade channel (IRQ2), the heart beat + * timer (IRQ0), and the keyboard controller (IRQ1) cannot be + * programmed for level mode. + */ + elc_mask = 0xf8; + break; + case IO_ELCR2: + atpic = &vatpic->atpic[1]; + /* + * For the slave PIC the real time clock (IRQ8) and the floating + * point error interrupt (IRQ13) cannot be programmed for level + * mode. + */ + elc_mask = 0xde; + break; + default: + return (-1); + } + + if (bytes != 1) + return (-1); + + VATPIC_LOCK(vatpic); + if (in) { + *eax = atpic->elc; + } else { + atpic->elc = *eax & elc_mask; + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof (struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mutex_init(&vatpic->lock, NULL, MUTEX_ADAPTIVE, NULL); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + mutex_destroy(&vatpic->lock); + free(vatpic, M_VATPIC); +} diff --git a/usr/src/uts/intel/io/vmm/io/vatpic.h b/usr/src/uts/intel/io/vmm/io/vatpic.h new file mode 100644 index 0000000000..4ed51a06ed --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vatpic.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include <isa/isareg.h> + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax); +int vatpic_slave_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax); +int vatpic_elc_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, + enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#endif /* _VATPIC_H_ */ diff --git a/usr/src/uts/intel/io/vmm/io/vatpit.c b/usr/src/uts/intel/io/vmm/io/vatpit.c new file mode 100644 index 0000000000..9616444d25 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vatpit.c @@ -0,0 +1,489 @@ +/*- + * Copyright (c) 2018 Joyent, Inc. + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mutex_enter(&((vatpit)->lock)) +#define VATPIT_UNLOCK(vatpit) mutex_exit(&((vatpit)->lock)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + +struct channel { + uint8_t mode; + uint16_t initial; /* initial counter value */ + + uint8_t reg_cr[2]; + uint8_t reg_ol[2]; + uint8_t reg_status; + + bool slatched; /* status latched */ + bool olatched; /* output latched */ + bool cr_sel; /* read MSB from control register */ + bool ol_sel; /* read MSB from output latch */ + bool fr_sel; /* read MSB from free-running timer */ + + hrtime_t time_loaded; /* time when counter was loaded */ + hrtime_t time_target; /* target time */ + uint64_t total_target; + + struct callout callout; + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + kmutex_t lock; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + const hrtime_t delta = gethrtime() - c->time_loaded; + + return (hrt_freq_count(delta, PIT_8254_FREQ)); +} + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + uint64_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN || c->mode == TIMER_SQWAVE) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c = &vatpit->channel[0]; + + if (c->initial == 0) { + return; + } + + c->total_target += c->initial; + c->time_target = c->time_loaded + + hrt_freq_interval(PIT_8254_FREQ, c->total_target); + + /* + * If we are more than 'c->initial' ticks behind, reset the timer base + * to fire at the next 'c->initial' interval boundary. 
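+	 * For example, with a 100 Hz guest timer (initial ==
+	 * TIMER_DIV(PIT_8254_FREQ, 100) == 11932 ticks) that has fallen
+	 * 30000 ticks behind, total_target is advanced by
+	 * roundup(30000, 11932) == 35796 ticks so that the next expiry
+	 * still lands on an 'initial' boundary.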
+ */ + hrtime_t now = gethrtime(); + if (c->time_target < now) { + const uint64_t ticks_behind = + hrt_freq_count(c->time_target - now, PIT_8254_FREQ); + + c->total_target += roundup(ticks_behind, c->initial); + c->time_target = c->time_loaded + + hrt_freq_interval(PIT_8254_FREQ, c->total_target); + } + + callout_reset_hrtime(&c->callout, c->time_target, + vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + uint64_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olatched) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an OS bug - reading the value of the timer + * without having set up the initial value. + * + * The original user-space version of this code set the timer to + * 100hz in this condition; do the same here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + c->time_loaded = gethrtime(); + c->reg_status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = vatpit_delta_ticks(vatpit, c); + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olatched = true; + c->ol_sel = true; + c->reg_ol[1] = lval; /* LSB */ + c->reg_ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if ((cmd & TIMER_RB_LCTR) == 0 && !c->olatched) { + (void) pit_update_counter(vatpit, c, true); + } + + if ((cmd & TIMER_RB_LSTATUS) == 0 && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->reg_status |= TIMER_STS_OUT; + else + c->reg_status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw; + uint8_t mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + /* Clear don't-care bit (M2) when M1 is set */ + if ((mode & TIMER_RATEGEN) != 0) { + mode &= ~TIMER_SWSTROBE; + } + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. 
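+		 * A latch command is a control word with the RW bits clear
+		 * (rw == TIMER_LATCH); it only snapshots the current count
+		 * for a subsequent read, leaving the programmed mode and
+		 * initial value intact, so the mode validation below is
+		 * skipped in that case.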
+ */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) { + pit_update_counter(vatpit, c, true); + } else { + c->mode = mode; + c->olatched = false; /* reset latch after reprogramming */ + c->reg_status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(void *arg, bool in, uint16_t port, uint8_t bytes, uint32_t *eax) +{ + struct vatpit *vatpit = arg; + struct channel *c; + uint8_t val; + int error; + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* Return the status byte if latched */ + *eax = c->reg_status; + c->slatched = false; + c->reg_status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (!c->olatched) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->fr_sel) { + tmp >>= 8; + } + tmp &= 0xff; + *eax = tmp; + c->fr_sel = !c->fr_sel; + } else { + if (c->ol_sel) { + *eax = c->reg_ol[1]; + c->ol_sel = false; + } else { + *eax = c->reg_ol[0]; + c->olatched = false; + } + } + } else { + if (!c->cr_sel) { + c->reg_cr[0] = *eax; + c->cr_sel = true; + } else { + c->reg_cr[1] = *eax; + c->cr_sel = false; + + c->reg_status &= ~TIMER_STS_NULLCNT; + c->fr_sel = false; + c->initial = c->reg_cr[0] | (uint16_t)c->reg_cr[1] << 8; + c->time_loaded = gethrtime(); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->time_target = c->time_loaded; + c->total_target = 0; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax) +{ + struct vatpit *vatpit = arg; + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof (struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mutex_init(&vatpit->lock, NULL, MUTEX_ADAPTIVE, NULL); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, 1); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + mutex_destroy(&vatpit->lock); + free(vatpit, M_VATPIT); +} + +void +vatpit_localize_resources(struct vatpit *vatpit) +{ + for (uint_t i = 0; i < 3; i++) { + /* Only localize channels which might be running */ + if (vatpit->channel[i].mode != 0) { + vmm_glue_callout_localize(&vatpit->channel[i].callout); + } + } +} diff --git 
a/usr/src/uts/intel/io/vmm/io/vatpit.h b/usr/src/uts/intel/io/vmm/io/vatpit.h new file mode 100644 index 0000000000..bee3a88293 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vatpit.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include <machine/timerreg.h> + +#define NMISC_PORT 0x61 + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax); +int vatpit_nmisc_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *eax); + +void vatpit_localize_resources(struct vatpit *); + +#endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/intel/io/vmm/io/vhpet.c b/usr/src/uts/intel/io/vmm/io/vhpet.c new file mode 100644 index 0000000000..d9515d0cc3 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vhpet.c @@ -0,0 +1,764 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <dev/acpica/acpi_hpet.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +#include "vmm_ktr.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. + */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet_timer { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + hrtime_t callout_expire; /* time when counter==compval */ + struct vhpet_callout_arg arg; +}; + +struct vhpet { + struct vm *vm; + kmutex_t lock; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t base_count; /* HPET counter base value */ + hrtime_t base_time; /* uptime corresponding to base value */ + + struct vhpet_timer timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mutex_enter(&((vhp)->lock)) +#define VHPET_UNLOCK(vhp) mutex_exit(&((vhp)->lock)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + hrtime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool +vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. 
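+	 * A return value of 0 therefore doubles as "not routed"; callers
+	 * below treat ioapic pin 0 as invalid for HPET interrupts.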
+ */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static uint32_t +vhpet_counter(struct vhpet *vhpet, hrtime_t *nowptr) +{ + const hrtime_t now = gethrtime(); + uint32_t val = vhpet->base_count; + + if (vhpet_counter_enabled(vhpet)) { + const hrtime_t delta = now - vhpet->base_time; + + ASSERT3S(delta, >=, 0); + val += hrt_freq_count(delta, HPET_FREQ); + } else { + /* Value of the counter is meaningless when it is disabled */ + } + + if (nowptr != NULL) { + *nowptr = now; + } + return (val); +} + +static void +vhpet_timer_clear_isr(struct vhpet *vhpet, int n) +{ + int pin; + + if (vhpet->isr & (1 << n)) { + pin = vhpet_timer_ioapic_pin(vhpet, n); + KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); + vioapic_deassert_irq(vhpet->vm, pin); + vhpet->isr &= ~(1 << n); + } +} + +static __inline bool +vhpet_periodic_timer(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); +} + +static __inline bool +vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); +} + +static __inline bool +vhpet_timer_edge_trig(struct vhpet *vhpet, int n) +{ + + KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " + "timer %d is using MSI", n)); + + if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) + return (true); + else + return (false); +} + +static void +vhpet_timer_interrupt(struct vhpet *vhpet, int n) +{ + int pin; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + /* + * If a level triggered interrupt is already asserted then just return. + */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that is "ahead" + * of 'counter'. 
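+	 *
+	 * For example, with compval == 100, comprate == 50 and
+	 * counter == 230, the next comparator value is
+	 * 100 + ((230 - 100) / 50 + 1) * 50 == 250, the first
+	 * compval + N * comprate value beyond the current counter.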
+ */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + hrtime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, hrtime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_expire < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, hrtime_t now) +{ + struct vhpet_timer *timer = &vhpet->timer[n]; + + if (timer->comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + const hrtime_t delta = hrt_freq_interval(HPET_FREQ, + timer->compval - counter); + timer->callout_expire = now + delta; + callout_reset_hrtime(&timer->callout, timer->callout_expire, + vhpet_handler, &timer->arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->base_time = gethrtime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. 
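+		 * base_time was resampled just above, so each timer is
+		 * re-armed as though the counter resumed from base_count
+		 * at this instant.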
+ */ + vhpet_start_timer(vhpet, i, vhpet->base_count, + vhpet->base_time); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, hrtime_t now) +{ + int i; + + vhpet->base_count = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. 
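update_register() above is the masked read-modify-write used throughout the MMIO paths that follow; only the bits selected by 'mask' change, which is how a 4-byte guest write updates one half of a 64-bit register. A standalone sketch with arbitrary values:

    #include <stdio.h>
    #include <stdint.h>

    static void
    update_register(uint64_t *regptr, uint64_t data, uint64_t mask)
    {
        *regptr &= ~mask;
        *regptr |= (data & mask);
    }

    int
    main(void)
    {
        uint64_t reg = 0x1111222233334444ULL;

        /* Replace only the upper half: prints aaaabbbb33334444. */
        update_register(&reg, 0xaaaabbbb00000000ULL, 0xffffffff00000000ULL);
        printf("%016llx\n", (unsigned long long)reg);
        return (0);
    }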
+ */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + hrtime_t now; + int i, offset; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'base_count' with the value right + * before it is disabled. + */ + counter = vhpet_counter(vhpet, &now); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. + */ + vhpet->config &= ~HPET_CNF_LEG_RT; + + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->base_count = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. 
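The comparator handling that follows can be condensed as below (a sketch only, with hypothetical names and ignoring the 32-bit masking that update_register() performs): in periodic mode every write establishes the rate, but the live comparator is touched only while HPET_TCNF_VAL_SET is armed, and the write consumes that permission.

    #include <stdbool.h>
    #include <stdint.h>

    struct ptimer {
        uint32_t compval;   /* live comparator */
        uint32_t comprate;  /* periodic interval */
        bool val_set;       /* HPET_TCNF_VAL_SET armed by the guest */
    };

    static void
    periodic_comparator_write(struct ptimer *t, uint32_t val)
    {
        t->comprate = val;
        if (t->val_set)
            t->compval = val;
        t->val_set = false;     /* permission is good for one write only */
    }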
+ */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + + vhpet = malloc(sizeof (struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mutex_init(&vhpet->lock, NULL, MUTEX_ADAPTIVE, NULL); + + pincount = vioapic_pincount(vm); + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. 
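The allowed_irqs mask computed just above lands in the top 32 bits of each timer's cap_config (the interrupt-routing capability field, consulted later as cap_config >> 32). A standalone sketch of the selection for a hypothetical 24-pin ioapic:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t
    hpet_allowed_irqs(int pincount)
    {
        if (pincount >= 32)
            return (0xff000000);                /* irqs 24-31 */
        else if (pincount >= 20)
            return (0xfULL << (pincount - 4));  /* 4 upper irqs */
        else
            return (0);
    }

    int
    main(void)
    {
        /* With 24 pins this prints f00000: irqs 20 through 23. */
        printf("%llx\n", (unsigned long long)hpet_allowed_irqs(24));
        return (0);
    }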
+ */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + mutex_destroy(&vhpet->lock); + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} +void +vhpet_localize_resources(struct vhpet *vhpet) +{ + for (uint_t i = 0; i < VHPET_NUM_TIMERS; i++) { + vmm_glue_callout_localize(&vhpet->timer[i].callout); + } +} diff --git a/usr/src/uts/intel/io/vmm/io/vhpet.h b/usr/src/uts/intel/io/vmm/io/vhpet.h new file mode 100644 index 0000000000..0ea0a6b15a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vhpet.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size); +int vhpet_mmio_read(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size); +int vhpet_getcap(struct vm_hpet_cap *cap); + +void vhpet_localize_resources(struct vhpet *vhpet); + +#endif /* _VHPET_H_ */ diff --git a/usr/src/uts/intel/io/vmm/io/vioapic.c b/usr/src/uts/intel/io/vmm/io/vioapic.c new file mode 100644 index 0000000000..90dedb9ac1 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vioapic.c @@ -0,0 +1,475 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/cpuset.h> + +#include <x86/apicreg.h> +#include <machine/vmm.h> + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 32 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct ioapic_stats { + uint64_t is_interrupts; + uint64_t is_saturate_low; + uint64_t is_saturate_high; +}; + +struct vioapic { + struct vm *vm; + kmutex_t lock; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + /* + * The sum of pin asserts (+1) and deasserts (-1) are tracked in + * 'acnt'. It is clamped to prevent overflow or underflow + * should emulation consumers feed it an invalid set of + * transitions. + */ + uint_t acnt; + } rtbl[REDIR_ENTRIES]; + struct ioapic_stats stats; +}; + +#define VIOAPIC_LOCK(vioapic) mutex_enter(&((vioapic)->lock)) +#define VIOAPIC_UNLOCK(vioapic) mutex_exit(&((vioapic)->lock)) +#define VIOAPIC_LOCKED(vioapic) MUTEX_HELD(&((vioapic)->lock)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + VERIFY(pin >= 0 && pin < REDIR_ENTRIES); + ASSERT(VIOAPIC_LOCKED(vioapic)); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? 
true : false; + if (level) { + if ((low & IOART_REM_IRR) != 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: irr pending", + pin); + return; + } + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + } + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); + vioapic->stats.is_interrupts++; +} + +static int +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + uint_t oldcnt, newcnt; + bool needintr = false; + int err = 0; + + VERIFY(pin >= 0 && pin < REDIR_ENTRIES); + ASSERT(VIOAPIC_LOCKED(vioapic)); + + oldcnt = newcnt = vioapic->rtbl[pin].acnt; + if (newstate) { + if (newcnt != UINT_MAX) { + newcnt++; + } else { + err = E2BIG; + DTRACE_PROBE2(vioapic__sat_high, + struct vioapic *, vioapic, int, pin); + vioapic->stats.is_saturate_high++; + } + } else { + if (newcnt != 0) { + newcnt--; + } else { + err = ERANGE; + DTRACE_PROBE2(vioapic__sat_low, + struct vioapic *, vioapic, int, pin); + vioapic->stats.is_saturate_low++; + } + } + vioapic->rtbl[pin].acnt = newcnt; + + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + DTRACE_PROBE2(vioapic__assert, struct vioapic *, vioapic, + int, pin); + } else if (oldcnt == 1 && newcnt == 0) { + DTRACE_PROBE2(vioapic__deassert, struct vioapic *, vioapic, + int, pin); + } + + if (needintr) { + vioapic_send_intr(vioapic, pin); + } + return (err); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + int err = 0; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + err = vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + err = vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + err = vioapic_set_pinstate(vioapic, irq, true); + if (err == 0) { + err = vioapic_set_pinstate(vioapic, irq, false); + } + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (err); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + int regnum, pin, lshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + 
default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + /* + * Switching from level to edge triggering will clear the IRR + * bit. This is what FreeBSD will do in order to EOI an + * interrupt when the IO-APIC doesn't support targeted EOI (see + * _ioapic_eoi_source). + */ + if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGREDG && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) != 0) + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * Generate an interrupt if the following conditions are met: + * - pin trigger mode is level + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGRLVL && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. + */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. 
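One shape the bookkeeping suggested above could take (entirely hypothetical, names included): cache a pin bitmap per vector whenever a redirection entry's vector changes, so that EOI only walks the pins actually routed to that vector instead of every entry.

    #include <stdint.h>

    /* Hypothetical cache: a 32-pin bitmap for each of the 256 vectors. */
    typedef struct eoi_cache {
        uint32_t pins_for_vector[256];
    } eoi_cache_t;

    /* Call when a pin's routed vector changes from old_vec to new_vec. */
    static void
    eoi_cache_move(eoi_cache_t *ec, int pin, uint8_t old_vec, uint8_t new_vec)
    {
        ec->pins_for_vector[old_vec] &= ~(1u << pin);
        ec->pins_for_vector[new_vec] |= (1u << pin);
    }

    /* EOI would then scan only this bitmap. */
    static uint32_t
    eoi_cache_pins(const eoi_cache_t *ec, uint8_t vector)
    {
        return (ec->pins_for_vector[vector]);
    }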
+ */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof (struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mutex_init(&vioapic->lock, NULL, MUTEX_ADAPTIVE, NULL); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + mutex_destroy(&vioapic->lock); + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} diff --git a/usr/src/uts/intel/io/vmm/io/vioapic.h b/usr/src/uts/intel/io/vmm/io/vioapic.h new file mode 100644 index 0000000000..3c74bd5170 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vioapic.h @@ -0,0 +1,64 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
+ */ + +#ifndef _VIOAPIC_H_ +#define _VIOAPIC_H_ + +#define VIOAPIC_BASE 0xFEC00000 +#define VIOAPIC_SIZE 4096 + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size); +int vioapic_mmio_read(struct vm *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#endif diff --git a/usr/src/uts/intel/io/vmm/io/vlapic.c b/usr/src/uts/intel/io/vmm/io/vlapic.c new file mode 100644 index 0000000000..1cd6c72aaf --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vlapic.c @@ -0,0 +1,1841 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/cpuset.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/clock.h> + +#include <machine/vmm.h> +#include <sys/vmm_kernel.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vlapic.h" +#include "vlapic_priv.h" +#include "vioapic.h" + + +/* + * The 4 high bits of a given interrupt vector represent its priority. 
The same + * is true for the contents of the TPR when it is used to calculate the ultimate + * PPR of an APIC - the 4 high bits hold the priority. + */ +#define PRIO(x) ((x) & 0xf0) + +#define VLAPIC_VERSION (16) + +/* + * The 'vlapic->timer_lock' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK(vlapic) mutex_enter(&((vlapic)->timer_lock)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mutex_exit(&((vlapic)->timer_lock)) +#define VLAPIC_TIMER_LOCKED(vlapic) MUTEX_HELD(&((vlapic)->timer_lock)) + +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when calculating times + */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +#define APICBASE_ADDR_MASK 0xfffffffffffff000UL + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_callout_handler(void *arg); + +#ifdef __ISRVEC_DEBUG +static void vlapic_isrstk_accept(struct vlapic *, int); +static void vlapic_isrstk_eoi(struct vlapic *, int); +static void vlapic_isrstk_verify(const struct vlapic *); +#endif /* __ISRVEC_DEBUG */ + + +static __inline bool +vlapic_x2mode(const struct vlapic *vlapic) +{ + return ((vlapic->msr_apicbase & APICBASE_X2APIC) != 0); +} + +static __inline bool +vlapic_hw_disabled(const struct vlapic *vlapic) +{ + return ((vlapic->msr_apicbase & APICBASE_ENABLED) == 0); +} + +static __inline bool +vlapic_sw_disabled(const struct vlapic *vlapic) +{ + const struct LAPIC *lapic = vlapic->apic_page; + + return ((lapic->svr & APIC_SVR_ENABLE) == 0); +} + +static __inline bool +vlapic_enabled(const struct vlapic *vlapic) +{ + return (!vlapic_hw_disabled(vlapic) && !vlapic_sw_disabled(vlapic)); +} + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (vlapic_x2mode(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (vlapic_x2mode(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (vlapic_x2mode(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. 
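For a concrete feel of the ID handling in this stretch (standalone sketch, hypothetical vcpuid): an xAPIC guest sees the vcpuid in the top byte of the ID register, while in x2APIC mode the ID is the vcpuid itself and x2apic_ldr() above derives the logical destination from it, cluster in bits 31:16 and a one-hot member bit below.

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors x2apic_ldr() for a given APIC ID. */
    static uint32_t
    ldr_for(uint32_t apicid)
    {
        uint32_t ldr = 1 << (apicid & 0xf);

        ldr |= (apicid & 0xffff0) << 12;
        return (ldr);
    }

    int
    main(void)
    {
        uint32_t vcpuid = 0x25;

        printf("xapic id register: %08x\n", vcpuid << 24);   /* 25000000 */
        printf("x2apic ldr:        %08x\n", ldr_for(vcpuid)); /* 00020020 */
        return (0);
    }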
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. + */ + + const hrtime_t now = gethrtime(); + if (vlapic->timer_fire_when > now) { + ccr += hrt_freq_count(vlapic->timer_fire_when - now, + vlapic->timer_cur_freq); + } + } + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, " + "icr_timer is %x", ccr, lapic->icr_timer)); + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + vlapic->timer_cur_freq = VLAPIC_BUS_FREQ / divisor; + vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq, + lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +vcpu_notify_t +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask, tmr; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + /* ignore interrupt on software-disabled APIC */ + return (VCPU_NOTIFY_NONE); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); + + /* + * If the error LVT is configured to interrupt the vCPU, it will + * have delivered a notification through that mechanism. + */ + return (VCPU_NOTIFY_NONE); + } + + if (vlapic->ops.set_intr_ready) { + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + } + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + tmrptr = &lapic->tmr0; + irrptr = &lapic->irr0; + + /* + * Update TMR for requested vector, if necessary. + * This must be done prior to asserting the bit in IRR so that the + * proper TMR state is always visible before the to-be-queued interrupt + * can be injected. + */ + tmr = atomic_load_acq_32(&tmrptr[idx]); + if ((tmr & mask) != (level ? 
mask : 0)) { + if (level) { + atomic_set_int(&tmrptr[idx], mask); + } else { + atomic_clear_int(&tmrptr[idx], mask); + } + } + + /* Now set the bit in IRR */ + atomic_set_int(&irrptr[idx], mask); + + return (VCPU_NOTIFY_EXIT); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i); + default: + panic("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %x", index, offset)); + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, uint_t lvt) +{ + uint32_t mode, reg, vec; + vcpu_notify_t notify; + + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); + + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == 
APIC_LVT_ERROR); + return (0); + } + notify = vlapic_set_intr_ready(vlapic, vec, false); + vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_DM_EXTINT: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + default: + // Other modes ignored + return (0); + } + return (1); +} + +static uint_t +vlapic_active_isr(struct vlapic *vlapic) +{ + int i; + uint32_t *isrp; + + isrp = &vlapic->apic_page->isr7; + + for (i = 7; i >= 0; i--, isrp -= 4) { + uint32_t reg = *isrp; + + if (reg != 0) { + uint_t vec = (i * 32) + bsrl(reg); + + if (vec < 16) { + /* + * Truncate the illegal low vectors to value of + * 0, indicating that no active ISR was found. + */ + return (0); + } + return (vec); + } + } + + return (0); +} + +/* + * After events which might arbitrarily change the value of PPR, such as a TPR + * write or an EOI, calculate that new PPR value and store it in the APIC page. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + isrvec = vlapic_active_isr(vlapic); + tpr = vlapic->apic_page->tpr; + + /* + * Algorithm adopted from section "Interrupt, Task and Processor + * Priority" in Intel Architecture Manual Vol 3a. + */ + if (PRIO(tpr) >= PRIO(isrvec)) { + ppr = tpr; + } else { + ppr = PRIO(isrvec); + } + + vlapic->apic_page->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +/* + * When a vector is asserted in ISR as in-service, the PPR must be raised to the + * priority of that vector, as the vCPU would have been at a lower priority in + * order for the vector to be accepted. + */ +static void +vlapic_raise_ppr(struct vlapic *vlapic, int vec) +{ + struct LAPIC *lapic = vlapic->apic_page; + int ppr; + + ppr = PRIO(vec); + +#ifdef __ISRVEC_DEBUG + KASSERT(vec >= 16 && vec < 256, ("invalid vector %d", vec)); + KASSERT(ppr > lapic->tpr, ("ppr %x <= tpr %x", ppr, lapic->tpr)); + KASSERT(ppr > lapic->ppr, ("ppr %x <= old ppr %x", ppr, lapic->ppr)); + KASSERT(vec == (int)vlapic_active_isr(vlapic), ("ISR missing for ppr")); +#endif /* __ISRVEC_DEBUG */ + + lapic->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +void +vlapic_sync_tpr(struct vlapic *vlapic) +{ + vlapic_update_ppr(vlapic); +} + +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr; + int i; + uint_t idx, bitpos, vector; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + if (isrptr[idx] != 0) { + bitpos = bsrl(isrptr[idx]); + vector = i * 32 + bitpos; + + isrptr[idx] &= ~(1 << bitpos); + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); +#ifdef __ISRVEC_DEBUG + vlapic_isrstk_eoi(vlapic, vector); +#endif + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, + vector); + } + return; + } + } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return (lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, 
APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) +{ + + vlapic->esr_pending |= mask; + + /* + * Avoid infinite recursion if the error LVT itself is configured with + * an illegal vector. + */ + if (lvt_error) + return; + + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + ASSERT(VLAPIC_TIMER_LOCKED(vlapic)); + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + if (!vlapic_enabled(vlapic)) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. + */ + switch (vector) { + case APIC_LVT_LINT0: + vm_inject_extint(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_LINT1: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + default: + break; + } + return (0); + } + + switch (vector) { + case APIC_LVT_LINT0: + case APIC_LVT_LINT1: + case APIC_LVT_TIMER: + case APIC_LVT_ERROR: + case APIC_LVT_PMC: + case APIC_LVT_THERMAL: + case APIC_LVT_CMCI: + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } + break; + default: + return (EINVAL); + } + return (0); +} + +static void +vlapic_callout_reset(struct vlapic *vlapic) +{ + callout_reset_hrtime(&vlapic->callout, vlapic->timer_fire_when, + vlapic_callout_handler, vlapic, C_ABSOLUTE); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. We can depend on the fact that + * cyclics (which underly these callouts) will never be called + * early. + */ + const hrtime_t now = gethrtime(); + const hrtime_t delta = now - vlapic->timer_fire_when; + if (delta >= vlapic->timer_period) { + /* + * If we are so behind that we have missed an entire + * timer period, reset the time base rather than + * attempting to catch up. 
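A condensed sketch of the decision described above (hypothetical helper and nanosecond values): as long as the handler ran less than one period late, the next firing stays on the original grid; once a whole period has been missed, the grid is rebased from the current time.

    #include <stdio.h>

    typedef long long hrtime_t;     /* stand-in for the native type */

    static hrtime_t
    next_fire(hrtime_t fire_when, hrtime_t period, hrtime_t now)
    {
        if (now - fire_when >= period)
            return (now + period);          /* rebase */
        return (fire_when + period);        /* stay on the grid */
    }

    int
    main(void)
    {
        /* 30us late on a 100us period: prints 1100000 (old grid kept). */
        printf("%lld\n", next_fire(1000000, 100000, 1030000));
        /* 150us late: prints 1250000 (rebased from now). */
        printf("%lld\n", next_fire(1000000, 100000, 1150000));
        return (0);
    }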
+ */ + vlapic->timer_fire_when = now + vlapic->timer_period; + } else { + vlapic->timer_fire_when += vlapic->timer_period; + } + vlapic_callout_reset(vlapic); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq, + lapic->icr_timer); + if (vlapic->timer_period != 0) { + vlapic->timer_fire_when = gethrtime() + vlapic->timer_period; + vlapic_callout_reset(vlapic); + } else { + vlapic->timer_fire_when = 0; + callout_stop(&vlapic->callout); + } + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. + * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, dest); + amask = vm_active_cpus(vm); + if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) + CPU_SET(vcpuid, dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only available in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in its LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + while ((vcpuid = CPU_FFS(&amask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &amask); + + vlapic = vm_lapic(vm, vcpuid); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (vlapic_x2mode(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it. 
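A standalone sketch of the xAPIC cluster-model comparison being performed here, using made-up destination and LDR values: the cluster fields must be equal and the 4-bit member bitmaps must intersect.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdint.h>

    /* xAPIC cluster model: dest[7:4] is the cluster, dest[3:0] the members. */
    static bool
    cluster_match(uint8_t dest, uint32_t ldr)
    {
        uint32_t cluster = ldr >> 28;
        uint32_t ldest = (ldr >> 24) & 0xf;

        if (cluster != ((uint32_t)(dest >> 4) & 0xf))
            return (false);
        return (((dest & 0xf) & ldest) != 0);
    }

    int
    main(void)
    {
        /* Cluster 2, members {0,1} vs an LDR in cluster 2, member 0: hit. */
        printf("%d\n", cluster_match(0x23, 0x21000000));
        /* Same members, but the LDR claims cluster 3: miss. */
        printf("%d\n", cluster_match(0x23, 0x31000000));
        return (0);
    }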
+ */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu"); +static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " + "from %#x to %#x", lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~0xf) { + vm_inject_gp(vlapic->vm, vlapic->vcpuid); + return; + } + + tpr = val << 4; + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + const struct LAPIC *lapic = vlapic->apic_page; + + return (lapic->tpr >> 4); +} + +void +vlapic_icrlo_write_handler(struct vlapic *vlapic) +{ + int i; + cpuset_t dmask; + uint64_t icrval; + uint32_t dest, vec, mode, dsh; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (vlapic_x2mode(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + dsh = icrval & APIC_DEST_MASK; + + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); + return; + } + if (mode == APIC_DELMODE_INIT && + (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) { + /* No work required to deassert INIT */ + return; + } + if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) && + !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) { + /* + * While Intel makes no mention of restrictions for destination + * shorthand when sending INIT or SIPI, AMD requires either a + * specific destination or all-excluding self. Common use seems + * to be restricted to those two cases. Until handling is in + * place to halt a guest which makes such a frivolous request, + * we will ignore them. + */ + return; + } + + switch (dsh) { + case APIC_DEST_DESTFLD: + vlapic_calcdest(vlapic->vm, &dmask, dest, + (icrval & APIC_DESTMODE_LOG) == 0, false, + vlapic_x2mode(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + /* + * All possible delivery notations are covered above. + * We should never end up here. 
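The destination decode a little earlier in this handler differs by mode: x2APIC carries a full 32-bit destination in ICR_HI, while xAPIC uses only its top byte. A standalone sketch with an arbitrary ICR value:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* ICR_HI = 0x03000000, ICR_LO = 0x000040fd (fixed, vector 0xfd). */
        uint64_t icrval = 0x03000000000040fdULL;

        printf("x2apic dest: %x\n", (uint32_t)(icrval >> 32));        /* 3000000 */
        printf("xapic dest:  %x\n", (uint32_t)(icrval >> (32 + 24))); /* 3 */
        return (0);
    }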
+ */ + panic("unknown delivery shorthand: %x", dsh); + } + + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + switch (mode) { + case APIC_DELMODE_FIXED: + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, + VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vlapic->vm, i, + VLAPIC_IPI_RECV, 1); + break; + case APIC_DELMODE_NMI: + vm_inject_nmi(vlapic->vm, i); + break; + case APIC_DELMODE_INIT: + (void) vm_inject_init(vlapic->vm, i); + break; + case APIC_DELMODE_STARTUP: + (void) vm_inject_sipi(vlapic->vm, i, vec); + break; + case APIC_DELMODE_LOWPRIO: + case APIC_DELMODE_SMI: + default: + /* Unhandled IPI modes (for now) */ + break; + } + } +} + +void +vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val) +{ + const int vec = val & 0xff; + + /* self-IPI is only exposed via x2APIC */ + ASSERT(vlapic_x2mode(vlapic)); + + lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_RECV, 1); + VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); +} + +int +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct LAPIC *lapic = vlapic->apic_page; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + if (vlapic->ops.sync_state) { + (*vlapic->ops.sync_state)(vlapic); + } + + irrptr = &lapic->irr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); + } else + break; + } + } + return (0); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *irrptr, *isrptr; + int idx; + + KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector)); + + if (vlapic->ops.intr_accepted) + return ((*vlapic->ops.intr_accepted)(vlapic, vector)); + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. + */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * The only way a fresh vector could be accepted into ISR is if it was + * of a higher priority than the current PPR. With that vector now + * in-service, the PPR must be raised. + */ + vlapic_raise_ppr(vlapic, vector); + +#ifdef __ISRVEC_DEBUG + vlapic_isrstk_accept(vlapic, vector); +#endif +} + +void +vlapic_svr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; + + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. 
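The deliverability test in vlapic_pending_intr() above boils down to a priority-class comparison against the PPR; a standalone sketch with arbitrary values:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdint.h>

    #define PRIO(x) ((x) & 0xf0)

    static bool
    deliverable(uint8_t vector, uint8_t ppr)
    {
        return (PRIO(vector) > PRIO(ppr));
    }

    int
    main(void)
    {
        printf("%d\n", deliverable(0x45, 0x50)); /* 0: class 4 does not beat 5 */
        printf("%d\n", deliverable(0x45, 0x30)); /* 1: class 4 beats 3 */
        return (0);
    }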
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +static bool +vlapic_read(struct vlapic *vlapic, uint16_t offset, uint32_t *outp) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + ASSERT3U(offset & 0x3, ==, 0); + ASSERT3U(offset, <, PAGESIZE); + ASSERT3P(outp, !=, NULL); + + uint32_t data = 0; + switch (offset) { + case APIC_OFFSET_ID: + data = lapic->id; + break; + case APIC_OFFSET_VER: + data = lapic->version; + break; + case APIC_OFFSET_TPR: + data = lapic->tpr; + break; + case APIC_OFFSET_APR: + data = lapic->apr; + break; + case APIC_OFFSET_PPR: + data = lapic->ppr; + break; + case APIC_OFFSET_LDR: + data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + ASSERT3U(data, ==, *reg); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + data = 0; + break; + + case APIC_OFFSET_SELF_IPI: + case APIC_OFFSET_EOI: + /* Write-only register */ + *outp = 0; + return (false); + + default: + /* Invalid register */ + *outp = 0; + return (false); + } + + *outp = data; + return (true); +} + +static bool +vlapic_write(struct vlapic *vlapic, uint16_t offset, uint32_t data) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + + ASSERT3U(offset & 0xf, ==, 0); + ASSERT3U(offset, <, PAGESIZE); + + switch (offset) { + case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = data; + vlapic_icrlo_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (vlapic_x2mode(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + /* Read-only register */ + return (false); + + default: + /* Invalid register */ + return (false); + } + + return (true); +} + +void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr, *irrptr; + + /* Reset any timer-related state first */ + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + lapic->icr_timer = 0; + lapic->ccr_timer = 0; + VLAPIC_TIMER_UNLOCK(vlapic); + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + /* + * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so + * it is not leftover after the reset. This is performed after the APIC + * timer has been stopped, in case it happened to fire just prior to + * being deactivated. + */ + if (vlapic->ops.sync_state) { + (*vlapic->ops.sync_state)(vlapic); + } + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + + lapic->tpr = 0; + lapic->apr = 0; + lapic->ppr = 0; + +#ifdef __ISRVEC_DEBUG + /* With the PPR cleared, the isrvec tracking should be reset too */ + vlapic->isrvec_stk_top = 0; +#endif + + lapic->eoi = 0; + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic->svr_last = lapic->svr; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + irrptr = &lapic->irr0; + for (uint_t i = 0; i < 8; i++) { + atomic_store_rel_int(&isrptr[i * 4], 0); + atomic_store_rel_int(&tmrptr[i * 4], 0); + atomic_store_rel_int(&irrptr[i * 4], 0); + } + + lapic->esr = 0; + vlapic->esr_pending = 0; + lapic->icr_lo = 0; + lapic->icr_hi = 0; + + lapic->lvt_cmci = 0; + lapic->lvt_timer = 0; + lapic->lvt_thermal = 0; + lapic->lvt_pcint = 0; + lapic->lvt_lint0 = 0; + lapic->lvt_lint1 = 0; + lapic->lvt_error = 0; + vlapic_mask_lvts(vlapic); +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. 
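+ * Note: the lock below is nonetheless created as an adaptive
+ * kmutex (MUTEX_ADAPTIVE) in this port.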
+ */ + mutex_init(&vlapic->timer_lock, NULL, MUTEX_ADAPTIVE, NULL); + callout_init(&vlapic->callout, 1); + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + callout_drain(&vlapic->callout); + mutex_destroy(&vlapic->timer_lock); +} + +int +vlapic_mmio_read(struct vlapic *vlapic, uint64_t gpa, uint64_t *valp, + uint_t size) +{ + ASSERT3U(gpa, >=, DEFAULT_APIC_BASE); + ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE); + + /* Ignore MMIO accesses when in x2APIC mode or hardware disabled */ + if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) { + *valp = UINT64_MAX; + return (0); + } + + const uint16_t off = gpa - DEFAULT_APIC_BASE; + uint32_t raw = 0; + (void) vlapic_read(vlapic, off & ~0xf, &raw); + + /* Shift and mask reads which are small and/or unaligned */ + const uint8_t align = off & 0xf; + if (align < 4) { + *valp = (uint64_t)raw << (align * 8); + } else { + *valp = 0; + } + + return (0); +} + +int +vlapic_mmio_write(struct vlapic *vlapic, uint64_t gpa, uint64_t val, + uint_t size) +{ + ASSERT3U(gpa, >=, DEFAULT_APIC_BASE); + ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE); + + /* Ignore MMIO accesses when in x2APIC mode or hardware disabled */ + if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) { + return (0); + } + + const uint16_t off = gpa - DEFAULT_APIC_BASE; + /* Ignore writes which are not 32-bits wide and 16-byte aligned */ + if ((off & 0xf) != 0 || size != 4) { + return (0); + } + + (void) vlapic_write(vlapic, off, (uint32_t)val); + return (0); +} + +/* Should attempts to change the APIC base address be rejected with a #GP? */ +int vlapic_gp_on_addr_change = 1; + +static vm_msr_result_t +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) +{ + const uint64_t diff = vlapic->msr_apicbase ^ val; + + /* + * Until the LAPIC emulation for switching between xAPIC and x2APIC + * modes is more polished, it will remain off-limits from being altered + * by the guest. + */ + const uint64_t reserved_bits = APICBASE_RESERVED | APICBASE_X2APIC | + APICBASE_BSP; + if ((diff & reserved_bits) != 0) { + return (VMR_GP); + } + + /* We do not presently allow the LAPIC access address to be modified. */ + if ((diff & APICBASE_ADDR_MASK) != 0) { + /* + * Explicitly rebuffing such requests with a #GP is the most + * straightforward way to handle the situation, but certain + * consumers (such as the KVM unit tests) may balk at the + * otherwise unexpected exception. + */ + if (vlapic_gp_on_addr_change) { + return (VMR_GP); + } + + /* If silence is required, just ignore the address change. 
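+ * The address bits are forced back to DEFAULT_APIC_BASE below so
+ * that the remainder of the written value is still accepted.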
*/ + val = (val & ~APICBASE_ADDR_MASK) | DEFAULT_APIC_BASE; + } + + vlapic->msr_apicbase = val; + return (VMR_OK); +} + +static __inline uint16_t +vlapic_msr_to_regoff(uint32_t msr) +{ + ASSERT3U(msr, >=, MSR_APIC_000); + ASSERT3U(msr, <, (MSR_APIC_000 + 0x100)); + + return ((msr - MSR_APIC_000) << 4); +} + +bool +vlapic_owned_msr(uint32_t msr) +{ + if (msr == MSR_APICBASE) { + return (true); + } + if (msr >= MSR_APIC_000 && + msr < (MSR_APIC_000 + 0x100)) { + return (true); + } + return (false); +} + +vm_msr_result_t +vlapic_rdmsr(struct vlapic *vlapic, uint32_t msr, uint64_t *valp) +{ + ASSERT(vlapic_owned_msr(msr)); + ASSERT3P(valp, !=, NULL); + + if (msr == MSR_APICBASE) { + *valp = vlapic->msr_apicbase; + return (VMR_OK); + } + + /* #GP for x2APIC MSR accesses in xAPIC mode */ + if (!vlapic_x2mode(vlapic)) { + return (VMR_GP); + } + + uint64_t out = 0; + const uint16_t reg = vlapic_msr_to_regoff(msr); + switch (reg) { + case APIC_OFFSET_ICR_LOW: { + /* Read from ICR register gets entire (64-bit) value */ + uint32_t low = 0, high = 0; + bool valid; + + valid = vlapic_read(vlapic, APIC_OFFSET_ICR_HI, &high); + VERIFY(valid); + valid = vlapic_read(vlapic, APIC_OFFSET_ICR_LOW, &low); + VERIFY(valid); + + *valp = ((uint64_t)high << 32) | low; + return (VMR_OK); + } + case APIC_OFFSET_ICR_HI: + /* Already covered by ICR_LOW */ + return (VMR_GP); + default: + break; + } + if (!vlapic_read(vlapic, reg, (uint32_t *)&out)) { + return (VMR_GP); + } + *valp = out; + return (VMR_OK); +} + +vm_msr_result_t +vlapic_wrmsr(struct vlapic *vlapic, uint32_t msr, uint64_t val) +{ + ASSERT(vlapic_owned_msr(msr)); + + if (msr == MSR_APICBASE) { + return (vlapic_set_apicbase(vlapic, val)); + } + + /* #GP for x2APIC MSR accesses in xAPIC mode */ + if (!vlapic_x2mode(vlapic)) { + return (VMR_GP); + } + + const uint16_t reg = vlapic_msr_to_regoff(msr); + switch (reg) { + case APIC_OFFSET_ICR_LOW: { + /* Write to ICR register sets entire (64-bit) value */ + bool valid; + + valid = vlapic_write(vlapic, APIC_OFFSET_ICR_HI, val >> 32); + VERIFY(valid); + valid = vlapic_write(vlapic, APIC_OFFSET_ICR_LOW, val); + VERIFY(valid); + return (VMR_OK); + } + case APIC_OFFSET_ICR_HI: + /* Already covered by ICR_LOW */ + return (VMR_GP); + case APIC_OFFSET_ESR: + /* Only 0 may be written from x2APIC mode */ + if (val != 0) { + return (VMR_GP); + } + break; + default: + break; + } + if (!vlapic_write(vlapic, reg, val)) { + return (VMR_GP); + } + return (VMR_OK); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. 
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (vlapic_x2mode(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. + */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + poke_cpu(hostcpu); +} + +void +vlapic_localize_resources(struct vlapic *vlapic) +{ + vmm_glue_callout_localize(&vlapic->callout); +} + +#ifdef __ISRVEC_DEBUG +static void +vlapic_isrstk_eoi(struct vlapic *vlapic, int vector) +{ + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + vlapic->isrvec_stk_top--; + vlapic_isrstk_verify(vlapic); +} + +static void +vlapic_isrstk_accept(struct vlapic *vlapic, int vector) +{ + int stk_top; + + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_isrstk_verify(vlapic); +} + +static void +vlapic_isrstk_dump(const struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic_page->isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} + +static void +vlapic_isrstk_verify(const struct vlapic *vlapic) +{ + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + /* + * Note: The value at index 0 in isrvec_stk is always 0. + * + * It is a placeholder for the value of ISR vector when no bits are set + * in the ISRx registers. + */ + if (vlapic->isrvec_stk_top == 0 && vlapic->isrvec_stk[0] != 0) { + panic("isrvec_stk is corrupted: %d", vlapic->isrvec_stk[0]); + } + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + vlapic_isrstk_dump(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. 
+ */ + i = 1; + isrptr = &vlapic->apic_page->isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + vlapic_isrstk_dump(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } +} +#endif diff --git a/usr/src/uts/intel/io/vmm/io/vlapic.h b/usr/src/uts/intel/io/vmm/io/vlapic.h new file mode 100644 index 0000000000..dd1970cb6a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vlapic.h @@ -0,0 +1,108 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +void vlapic_reset(struct vlapic *vlapic); + +int vlapic_mmio_write(struct vlapic *, uint64_t, uint64_t, uint_t); +int vlapic_mmio_read(struct vlapic *, uint64_t, uint64_t *, uint_t); + +bool vlapic_owned_msr(uint32_t); +vm_msr_result_t vlapic_rdmsr(struct vlapic *, uint32_t, uint64_t *); +vm_msr_result_t vlapic_wrmsr(struct vlapic *, uint32_t, uint64_t); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +vcpu_notify_t vlapic_set_intr_ready(struct vlapic *vlapic, int vector, + bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an IPI to interrupt the 'hostcpu'. 
+ */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu); + +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +void vlapic_sync_tpr(struct vlapic *vlapic); + +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +void vlapic_icrlo_write_handler(struct vlapic *vlapic); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val); + +void vlapic_localize_resources(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/intel/io/vmm/io/vlapic_priv.h b/usr/src/uts/intel/io/vmm/io/vlapic_priv.h new file mode 100644 index 0000000000..7b12b60f51 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vlapic_priv.h @@ -0,0 +1,207 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 
0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +#define VLAPIC_TMR_CNT 8 + +#ifdef DEBUG +#define __ISRVEC_DEBUG +#endif + +struct vlapic; + +struct vlapic_ops { + vcpu_notify_t (*set_intr_ready)(struct vlapic *vlapic, int vector, + bool level); + void (*sync_state)(struct vlapic *vlapic); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + + struct callout callout; /* vlapic timer */ + hrtime_t timer_fire_when; + hrtime_t timer_period; + uint32_t timer_cur_freq; + + kmutex_t timer_lock; + + uint64_t msr_apicbase; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + +#ifdef __ISRVEC_DEBUG + /* + * The 'isrvec_stk' is a stack of vectors injected by the local APIC. + * It is used as a debugging method to double-check the behavior of the + * emulation. Vectors are pushed to the stack when they are accepted + * for injection and popped from the stack when the processor performs + * an EOI. The vector on the top of the stack is used to verify the + * computed Processor Priority. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; +#endif +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/usr/src/uts/intel/io/vmm/io/vpmtmr.c b/usr/src/uts/intel/io/vmm/io/vpmtmr.c new file mode 100644 index 0000000000..9a7d7d4253 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vpmtmr.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + struct vm *vm; + void *io_cookie; + uint16_t io_port; + hrtime_t base_time; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + + vpmtmr = malloc(sizeof (struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->vm = vm; + vpmtmr->base_time = gethrtime(); + + return (vpmtmr); +} + +static int +vpmtmr_detach_ioport(struct vpmtmr *vpmtmr) +{ + if (vpmtmr->io_cookie != NULL) { + ioport_handler_t old_func; + void *old_arg; + int err; + + err = vm_ioport_detach(vpmtmr->vm, &vpmtmr->io_cookie, + &old_func, &old_arg); + if (err != 0) { + return (err); + } + + ASSERT3P(old_func, ==, vpmtmr_handler); + ASSERT3P(old_arg, ==, vpmtmr); + ASSERT3P(vpmtmr->io_cookie, ==, NULL); + vpmtmr->io_port = 0; + } + return (0); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + int err; + + err = vpmtmr_detach_ioport(vpmtmr); + VERIFY3P(err, ==, 0); + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_set_location(struct vm *vm, uint16_t ioport) +{ + struct vpmtmr *vpmtmr = vm_pmtmr(vm); + int err; + + if (vpmtmr->io_cookie != NULL) { + if (vpmtmr->io_port == ioport) { + /* already attached in the right place */ + return (0); + } + + err = vpmtmr_detach_ioport(vpmtmr); + VERIFY3P(err, ==, 0); + } + err = vm_ioport_attach(vm, ioport, vpmtmr_handler, vpmtmr, + &vpmtmr->io_cookie); + if (err == 0) { + vpmtmr->io_port = ioport; + } + + return (err); +} + +int +vpmtmr_handler(void *arg, bool in, uint16_t port, uint8_t bytes, uint32_t *val) +{ + struct vpmtmr *vpmtmr = arg; + + if (!in || bytes != 4) + return (-1); + + /* + * No locking needed because 'base_time' is written only during + * initialization. 
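+ * For illustration: at 3.579545 MHz, one second of uptime advances
+ * the count by 3579545 ticks, and the 32-bit value returned to the
+ * guest wraps roughly every 20 minutes (2^32 / 3579545 seconds).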
+ */ + const hrtime_t delta = gethrtime() - vpmtmr->base_time; + ASSERT3S(delta, >=, 0); + + *val = hrt_freq_count(delta, PMTMR_FREQ); + + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/io/vpmtmr.h b/usr/src/uts/intel/io/vmm/io/vpmtmr.h new file mode 100644 index 0000000000..0451da0350 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vpmtmr.h @@ -0,0 +1,58 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_set_location(struct vm *, uint16_t); + +int vpmtmr_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val); + +#endif diff --git a/usr/src/uts/intel/io/vmm/io/vrtc.c b/usr/src/uts/intel/io/vmm/io/vrtc.c new file mode 100644 index 0000000000..185cbc100a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vrtc.c @@ -0,0 +1,1011 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/clock.h> +#include <sys/sysctl.h> + +#include <machine/vmm.h> + +#include <isa/rtc.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof (struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + kmutex_t lock; + struct callout callout; + uint_t addr; /* RTC register to read or write */ + hrtime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mutex_enter(&((vrtc)->lock)) +#define VRTC_UNLOCK(vrtc) mutex_exit(&((vrtc)->lock)) +#define VRTC_LOCKED(vrtc) MUTEX_HELD(&((vrtc)->lock)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +/* Stop guest when invalid RTC time is detected */ +static int rtc_flag_broken_time = 1; + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
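+ * A divider-select value of 0b010 (0x20) corresponds to the normal
+ * 32.768 kHz time base; any other setting is treated here as a
+ * stopped clock.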
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, hrtime_t *basetime) +{ + time_t t = vrtc->base_rtctime; + hrtime_t base = vrtc->base_uptime; + + ASSERT(VRTC_LOCKED(vrtc)); + + if (update_enabled(vrtc)) { + const hrtime_t delta = gethrtime() - vrtc->base_uptime; + const time_t sec = delta / NANOSEC; + + ASSERT3S(delta, >=, 0); + + t += sec; + base += sec * NANOSEC; + } + if (basetime != NULL) { + *basetime = base; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + ASSERT(VRTC_LOCKED(vrtc)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
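+ * ct.hour hour register
+ * 0 12 (AM)
+ * 1 - 11 1 - 11 (AM)
+ * 12 12 (PM)
+ * 13 - 23 1 - 11 (PM)
+ * The PM indication is recorded by setting the MSB of the hour
+ * register after the switch below.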
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int century, error, hour, pm, year; + + ASSERT(VRTC_LOCKED(vrtc)); + + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof (struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { + /* invalid RTC seconds */ + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { + /* invalid RTC minutes */ + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + /* invalid RTC 12-hour format */ + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { + /* invalid RTC hour */ + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. + */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { + /* invalid RTC mday */ + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { + /* invalid RTC month */ + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { + /* invalid RTC year */ + goto fail; + } + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + /* invalid RTC century */ + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { + /* invalid RTC clocktime */ + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. 
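+ * Returning VRTC_BROKEN_TIME causes the caller to store it in
+ * 'base_rtctime', which update_enabled() then treats as a halted
+ * clock until the guest programs a valid date/time.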
+ */ + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, hrtime_t newbase) +{ + struct rtcdev *rtc; + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + ASSERT(VRTC_LOCKED(vrtc)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %lx to %lx", + oldtime, newtime); + + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. + */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static hrtime_t +vrtc_freq(struct vrtc *vrtc) +{ + const hrtime_t rate_freq[16] = { + 0, + NANOSEC / 256, + NANOSEC / 128, + NANOSEC / 8192, + NANOSEC / 4096, + NANOSEC / 2048, + NANOSEC / 1024, + NANOSEC / 512, + NANOSEC / 256, + NANOSEC / 128, + NANOSEC / 64, + NANOSEC / 32, + NANOSEC / 16, + NANOSEC / 8, + NANOSEC / 4, + NANOSEC / 2, + }; + + ASSERT(VRTC_LOCKED(vrtc)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. 
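+ * Note that, despite the name, the value computed here is the
+ * callout period in nanoseconds (0 meaning no callout is needed),
+ * matching the entries in rate_freq[] above.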
+ */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + uint_t sel = vrtc->rtcdev.reg_a & 0xf; + return (rate_freq[sel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (NANOSEC); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (NANOSEC); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, hrtime_t freqhrt) +{ + + ASSERT(VRTC_LOCKED(vrtc)); + + if (freqhrt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", NANOSEC / freqhrt); + callout_reset_hrtime(&vrtc->callout, freqhrt, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + hrtime_t basetime; + + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + hrtime_t freqhrt = vrtc_freq(vrtc); + KASSERT(freqhrt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqhrt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, hrtime_t freqhrt) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freqhrt == 0 && !active) || (freqhrt != 0 && active), + ("vrtc callout %s with frequency %llx", + active ? 
"active" : "inactive", NANOSEC / freqhrt)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + ASSERT(VRTC_LOCKED(vrtc)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %x to %x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + hrtime_t oldfreq, newfreq; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + ASSERT(VRTC_LOCKED(vrtc)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %x to %x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + hrtime_t basetime; + + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = gethrtime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%lx) and curtime (%lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + hrtime_t oldfreq, newfreq; + uint8_t oldval, changed; + + ASSERT(VRTC_LOCKED(vrtc)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %lx/%lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. 
+ */ + vrtc->base_uptime = gethrtime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %lx/%lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %x to %x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, gethrtime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, NULL); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof (struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %x to offset %x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof (struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, NULL); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val) +{ + struct vrtc *vrtc = arg; + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val) +{ + struct vrtc *vrtc = arg; + struct rtcdev *rtc = &vrtc->rtcdev; + hrtime_t basetime; + time_t curtime; + int error, offset; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof (struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. + */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. 
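+ * As on real hardware, reading register C clears the pending
+ * interrupt flags, hence the call to vrtc_set_reg_c() with 0
+ * immediately below.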
+ */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VM_CTR2(vm, "Read value %x from RTC offset %x", + *val, offset); + } else { + switch (offset) { + case 10: + VM_CTR1(vm, "RTC reg_a set to %x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VM_CTR1(vm, "RTC reg_b set to %x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VM_CTR1(vm, "RTC reg_c set to %x (ignored)", + *val); + break; + case 13: + VM_CTR1(vm, "RTC reg_d set to %x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VM_CTR2(vm, "RTC offset %x set to %x", offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, gethrtime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof (struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mutex_init(&vrtc->lock, NULL, MUTEX_ADAPTIVE, NULL); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, gethrtime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + callout_drain(&vrtc->callout); + mutex_destroy(&vrtc->lock); + free(vrtc, M_VRTC); +} + +void +vrtc_localize_resources(struct vrtc *vrtc) +{ + vmm_glue_callout_localize(&vrtc->callout); +} diff --git a/usr/src/uts/intel/io/vmm/io/vrtc.h b/usr/src/uts/intel/io/vmm/io/vrtc.h new file mode 100644 index 0000000000..d3140c1308 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/io/vrtc.h @@ -0,0 +1,58 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include <isa/isareg.h> + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val); +int vrtc_data_handler(void *arg, bool in, uint16_t port, uint8_t bytes, + uint32_t *val); + +void vrtc_localize_resources(struct vrtc *); + +#endif diff --git a/usr/src/uts/intel/io/vmm/seg_vmm.c b/usr/src/uts/intel/io/vmm/seg_vmm.c new file mode 100644 index 0000000000..863b283418 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/seg_vmm.c @@ -0,0 +1,569 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +/* + * segvmm - Virtual-Machine-Memory segment + * + * The vmm segment driver was designed for mapping regions of kernel memory + * allocated to an HVM instance into userspace for manipulation there. It + * draws direct lineage from the umap segment driver, but meant for larger + * mappings with fewer restrictions. + * + * seg*k*vmm, in contrast, has mappings for every VMM into kas. We use its + * mappings here only to find the relevant PFNs in segvmm_fault_in(). 
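 *
 * A rough sketch of the flow implemented below: segvmm_create() records
 * either a vm_object_t (direct object mapping) or a vm_client_t (vmspace
 * mapping) in its segvmm_data_t; page faults are then resolved to PFNs by
 * segvmm_fault_obj() via vm_object_pfn(), or by segvmm_fault_space() via
 * vmc_hold(), and the resulting translations are installed with
 * hat_devload().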
+ */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/hat_pte.h> +#include <vm/htable.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> + +#include <sys/seg_vmm.h> + +typedef struct segvmm_data { + krwlock_t svmd_lock; + vm_object_t *svmd_vmo; + vm_client_t *svmd_vmc; + uintptr_t svmd_off; + uchar_t svmd_prot; + size_t svmd_softlockcnt; +} segvmm_data_t; + + +static int segvmm_dup(struct seg *, struct seg *); +static int segvmm_unmap(struct seg *, caddr_t, size_t); +static void segvmm_free(struct seg *); +static faultcode_t segvmm_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segvmm_faulta(struct seg *, caddr_t); +static int segvmm_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segvmm_incore(struct seg *, caddr_t, size_t, char *); +static int segvmm_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int segvmm_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segvmm_getoffset(struct seg *, caddr_t); +static int segvmm_gettype(struct seg *, caddr_t); +static int segvmm_getvp(struct seg *, caddr_t, struct vnode **); +static int segvmm_advise(struct seg *, caddr_t, size_t, uint_t); +static void segvmm_dump(struct seg *); +static int segvmm_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int segvmm_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segvmm_getmemid(struct seg *, caddr_t, memid_t *); +static int segvmm_capable(struct seg *, segcapability_t); + +static struct seg_ops segvmm_ops = { + .dup = segvmm_dup, + .unmap = segvmm_unmap, + .free = segvmm_free, + .fault = segvmm_fault, + .faulta = segvmm_faulta, + .setprot = segvmm_setprot, + .checkprot = segvmm_checkprot, + .kluster = NULL, + .swapout = NULL, + .sync = segvmm_sync, + .incore = segvmm_incore, + .lockop = segvmm_lockop, + .getprot = segvmm_getprot, + .getoffset = segvmm_getoffset, + .gettype = segvmm_gettype, + .getvp = segvmm_getvp, + .advise = segvmm_advise, + .dump = segvmm_dump, + .pagelock = segvmm_pagelock, + .setpagesize = segvmm_setpagesize, + .getmemid = segvmm_getmemid, + .getpolicy = NULL, + .capable = segvmm_capable, + .inherit = seg_inherit_notsup +}; + +/* + * Unload a region from the HAT for A/D tracking. + */ +static void +segvmm_invalidate(void *arg, uintptr_t gpa, size_t sz) +{ + struct seg *seg = arg; + segvmm_data_t *svmd = seg->s_data; + + /* + * Invalidations are only necessary (and configured) for vmspace + * mappings. Direct vm_object mappings are not involved. + */ + ASSERT3P(svmd->svmd_vmo, ==, NULL); + + /* + * The region being invalidated may overlap with all, some, or none of + * this segment. We are only concerned about that overlap. 
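 *
 * Worked example (illustrative values only): with svmd_off = 0x1000 and
 * seg->s_size = 0x3000, an invalidation of gpa = 0, sz = 0x2000 clamps to
 * the range [0x1000, 0x2000), so only the first 0x1000 bytes of this
 * segment are unloaded.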
+ */ + const uintptr_t start = MAX(gpa, svmd->svmd_off); + const uintptr_t end = MIN(gpa + sz, svmd->svmd_off + seg->s_size); + if (start >= end) { + return; + } + ASSERT(start >= svmd->svmd_off && end <= svmd->svmd_off + seg->s_size); + ASSERT(start >= gpa && end <= gpa + sz); + const caddr_t unload_va = seg->s_base + (start - svmd->svmd_off); + const size_t unload_sz = (end - start); + ASSERT3U(unload_sz, <=, seg->s_size); + + hat_unload(seg->s_as->a_hat, unload_va, unload_sz, HAT_UNLOAD); +} + +/* + * Create a VMM-memory-backed segment. + */ +int +segvmm_create(struct seg **segpp, void *argsp) +{ + struct seg *seg = *segpp; + segvmm_crargs_t *cra = argsp; + segvmm_data_t *data; + + VERIFY((cra->vmo == NULL && cra->vmc != NULL) || + (cra->vmo != NULL && cra->vmc == NULL)); + VERIFY(cra->prot & PROT_USER); + VERIFY0(cra->offset & PAGEOFFSET); + + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL); + data->svmd_off = cra->offset; + data->svmd_prot = cra->prot & ~PROT_USER; + + seg->s_ops = &segvmm_ops; + seg->s_data = data; + + if (cra->vmo != NULL) { + data->svmd_vmo = cra->vmo; + /* Grab a hold on the VM object for the lifetime of segment */ + vm_object_reference(data->svmd_vmo); + } else { + int err; + + data->svmd_vmc = cra->vmc; + err = vmc_set_inval_cb(data->svmd_vmc, segvmm_invalidate, seg); + if (err != 0) { + seg->s_ops = NULL; + seg->s_data = NULL; + kmem_free(data, sizeof (*data)); + return (err); + } + } + return (0); +} + +static int +segvmm_dup(struct seg *seg, struct seg *newseg) +{ + segvmm_data_t *svmd = seg->s_data; + segvmm_data_t *newsvmd; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP); + rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL); + newsvmd->svmd_off = svmd->svmd_off; + newsvmd->svmd_prot = svmd->svmd_prot; + + newseg->s_ops = seg->s_ops; + newseg->s_data = newsvmd; + + if (svmd->svmd_vmo != NULL) { + /* Grab another hold for the duplicate segment */ + vm_object_reference(svmd->svmd_vmo); + newsvmd->svmd_vmo = svmd->svmd_vmo; + } else { + int err; + + newsvmd->svmd_vmc = vmc_clone(svmd->svmd_vmc); + /* + * The cloned client does not inherit the invalidation + * configuration, so attempt to set it here for the new segment. + */ + err = vmc_set_inval_cb(newsvmd->svmd_vmc, segvmm_invalidate, + newseg); + if (err != 0) { + newseg->s_ops = NULL; + newseg->s_data = NULL; + kmem_free(newsvmd, sizeof (*newsvmd)); + return (err); + } + } + + return (0); +} + +static int +segvmm_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Only allow unmap of entire segment */ + if (addr != seg->s_base || len != seg->s_size) { + return (EINVAL); + } + if (svmd->svmd_softlockcnt != 0) { + return (EAGAIN); + } + + /* Unconditionally unload the entire segment range. 
*/ + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + + seg_free(seg); + return (0); +} + +static void +segvmm_free(struct seg *seg) +{ + segvmm_data_t *svmd = seg->s_data; + + ASSERT(svmd != NULL); + + if (svmd->svmd_vmo != NULL) { + /* Release the VM object hold this segment possessed */ + vm_object_release(svmd->svmd_vmo); + svmd->svmd_vmo = NULL; + } else { + vmc_destroy(svmd->svmd_vmc); + svmd->svmd_vmc = NULL; + } + rw_destroy(&svmd->svmd_lock); + VERIFY(svmd->svmd_softlockcnt == 0); + kmem_free(svmd, sizeof (*svmd)); + seg->s_data = NULL; +} + +static int +segvmm_fault_obj(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + const uintptr_t end = va + len; + const int prot = svmd->svmd_prot; + const int uprot = prot | PROT_USER; + vm_object_t *vmo = svmd->svmd_vmo; + + ASSERT(vmo != NULL); + + va &= PAGEMASK; + uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off; + do { + pfn_t pfn; + + pfn = vm_object_pfn(vmo, off); + if (pfn == PFN_INVALID) { + return (FC_NOMAP); + } + + /* Ignore any large-page possibilities for now */ + hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD); + va += PAGESIZE; + off += PAGESIZE; + } while (va < end); + + return (0); +} + +static int +segvmm_fault_space(struct hat *hat, struct seg *seg, uintptr_t va, size_t len) +{ + segvmm_data_t *svmd = seg->s_data; + const uintptr_t end = va + len; + const int prot = svmd->svmd_prot; + const int uprot = prot | PROT_USER; + vm_client_t *vmc = svmd->svmd_vmc; + + ASSERT(vmc != NULL); + + va &= PAGEMASK; + uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off; + + do { + vm_page_t *vmp; + pfn_t pfn; + + vmp = vmc_hold(vmc, off, prot); + if (vmp == NULL) { + return (FC_NOMAP); + } + + pfn = vmp_get_pfn(vmp); + ASSERT3U(pfn, !=, PFN_INVALID); + + /* Ignore any large-page possibilities for now */ + hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD); + + if (vmp_release(vmp)) { + /* + * Region was unmapped from vmspace while we were + * loading it into this AS. Communicate it as if it + * were a fault. + */ + hat_unload(hat, (caddr_t)va, PAGESIZE, HAT_UNLOAD); + return (FC_NOMAP); + } + + va += PAGESIZE; + off += PAGESIZE; + } while (va < end); + + return (0); +} + +/* ARGSUSED */ +static faultcode_t +segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw rw) +{ + segvmm_data_t *svmd = seg->s_data; + int err = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + if (type == F_PROT) { + /* + * Since protection on the segment is fixed, there is nothing + * to do but report an error for protection faults. 
+ */ + return (FC_PROT); + } else if (type == F_SOFTUNLOCK) { + size_t plen = btop(len); + + rw_enter(&svmd->svmd_lock, RW_WRITER); + VERIFY(svmd->svmd_softlockcnt >= plen); + svmd->svmd_softlockcnt -= plen; + rw_exit(&svmd->svmd_lock); + return (0); + } + + VERIFY(type == F_INVAL || type == F_SOFTLOCK); + rw_enter(&svmd->svmd_lock, RW_WRITER); + + if (svmd->svmd_vmo != NULL) { + err = segvmm_fault_obj(hat, seg, (uintptr_t)addr, len); + } else { + err = segvmm_fault_space(hat, seg, (uintptr_t)addr, len); + } + if (type == F_SOFTLOCK && err == 0) { + size_t nval = svmd->svmd_softlockcnt + btop(len); + + if (svmd->svmd_softlockcnt >= nval) { + rw_exit(&svmd->svmd_lock); + return (FC_MAKE_ERR(EOVERFLOW)); + } + svmd->svmd_softlockcnt = nval; + } + + rw_exit(&svmd->svmd_lock); + return (err); +} + +/* ARGSUSED */ +static faultcode_t +segvmm_faulta(struct seg *seg, caddr_t addr) +{ + /* Do nothing since asynch pagefault should not load translation. */ + return (0); +} + +/* ARGSUSED */ +static int +segvmm_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* The seg_vmm driver does not yet allow protection to be changed. */ + return (EACCES); +} + +/* ARGSUSED */ +static int +segvmm_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + segvmm_data_t *svmd = seg->s_data; + int error = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&svmd->svmd_lock, RW_READER); + if ((svmd->svmd_prot & prot) != prot) { + error = EACCES; + } + rw_exit(&svmd->svmd_lock); + return (error); +} + +/* ARGSUSED */ +static int +segvmm_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there are no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +segvmm_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t sz = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + len = (len + PAGEOFFSET) & PAGEMASK; + while (len > 0) { + *vec = 1; + sz += PAGESIZE; + vec++; + len -= PAGESIZE; + } + return (sz); +} + +/* ARGSUSED */ +static int +segvmm_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* Report success since kernel pages are always in memory. */ + return (0); +} + +static int +segvmm_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + segvmm_data_t *svmd = seg->s_data; + size_t pgno; + uint_t prot; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&svmd->svmd_lock, RW_READER); + prot = svmd->svmd_prot; + rw_exit(&svmd->svmd_lock); + + /* + * Reporting protection is simple since it is not tracked per-page. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = prot; + } + return (0); +} + +/* ARGSUSED */ +static u_offset_t +segvmm_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. + */ + return (0); +} + +/* ARGSUSED */ +static int +segvmm_gettype(struct seg *seg, caddr_t addr) +{ + /* + * Since already-existing vmm reservoir pages are being mapped into + * userspace, always report the segment type as shared. 
+ */ + return (MAP_SHARED); +} + +/* ARGSUSED */ +static int +segvmm_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + *vpp = NULL; + return (0); +} + +/* ARGSUSED */ +static int +segvmm_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + if (behav == MADV_PURGE) { + /* Purge does not make sense for this mapping */ + return (EINVAL); + } + /* Indicate success for everything else. */ + return (0); +} + +/* ARGSUSED */ +static void +segvmm_dump(struct seg *seg) +{ + /* + * Since this is a mapping to share kernel data with userspace, nothing + * additional should be dumped. + */ +} + +/* ARGSUSED */ +static int +segvmm_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +segvmm_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +static int +segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + segvmm_data_t *svmd = seg->s_data; + + memidp->val[0] = (uintptr_t)svmd->svmd_vmo; + memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_off; + return (0); +} + +/* ARGSUSED */ +static int +segvmm_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capablities */ + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/sys/seg_vmm.h b/usr/src/uts/intel/io/vmm/sys/seg_vmm.h new file mode 100644 index 0000000000..5ba0dad5c3 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/seg_vmm.h @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VM_SEG_VMM_H +#define _VM_SEG_VMM_H + +#include <sys/vmm_vm.h> + +typedef struct segvmm_crargs { + uchar_t prot; /* protection */ + uintptr_t offset; + vm_object_t *vmo; + vm_client_t *vmc; +} segvmm_crargs_t; + +int segvmm_create(struct seg **, void *); + +#endif /* _VM_SEG_VMM_H */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_gpt.h b/usr/src/uts/intel/io/vmm/sys/vmm_gpt.h new file mode 100644 index 0000000000..a425fb53ec --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_gpt.h @@ -0,0 +1,91 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_GPT_H +#define _VMM_GPT_H + +#include <sys/types.h> + +/* + * Constants for the nodes in the GPT radix tree. Note + * that, in accordance with hardware page table descriptions, + * the root of the tree is referred to as "LEVEL4" while the + * leaf level is "LEVEL1". 
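 *
 * For orientation (assuming the standard 4-level x86 paging layout, which
 * this header does not spell out): a guest-physical address is split into
 * one 9-bit index per level plus a 12-bit page offset, roughly:
 *
 *	LEVEL4 index = (gpa >> 39) & 0x1ff	(root table)
 *	LEVEL3 index = (gpa >> 30) & 0x1ff
 *	LEVEL2 index = (gpa >> 21) & 0x1ff
 *	LEVEL1 index = (gpa >> 12) & 0x1ff	(leaf; maps a 4 KiB page)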
+ */ +enum vmm_gpt_node_level { + LEVEL4 = 0, + LEVEL3, + LEVEL2, + LEVEL1, + MAX_GPT_LEVEL, +}; + +/* + * The vmm_pte_ops structure contains function pointers for format-specific + * operations on page table entries. The operations are as follows: + * + * vpeo_map_table: Creates a PTE that maps an inner node in the page table. + * vpeo_map_page: Creates a leaf entry PTE that maps a page of physical memory. + * vpeo_pte_pfn: Returns the PFN contained in the given PTE. + * vpeo_pte_is_present: Returns true IFF the PTE maps a present page. + * vpeo_pte_prot: Returns a bitmask of protection bits for the PTE. + * The bits correspond to the standard mmap(2) bits: PROT_READ, PROT_WRITE, + * PROT_EXEC. + * vpeo_reset_dirty: Resets the dirty bit on the given PTE. If the second + * argument is `true`, the bit will be set, otherwise it will be cleared. + * Returns non-zero if the previous value of the bit was set. + * vpeo_reset_accessed: Resets the accessed bit on the given PTE. If the + * second argument is `true`, the bit will be set, otherwise it will be + * cleared. Returns non-zero if the previous value of the bit was set. + * vpeo_get_pmtp: Generate a properly formatted PML4 (EPTP/nCR3), given the root + * PFN for the GPT. + */ +typedef struct vmm_pte_ops vmm_pte_ops_t; +struct vmm_pte_ops { + uint64_t (*vpeo_map_table)(pfn_t); + uint64_t (*vpeo_map_page)(pfn_t, uint_t, uint8_t); + pfn_t (*vpeo_pte_pfn)(uint64_t); + bool (*vpeo_pte_is_present)(uint64_t); + uint_t (*vpeo_pte_prot)(uint64_t); + uint_t (*vpeo_reset_dirty)(uint64_t *, bool); + uint_t (*vpeo_reset_accessed)(uint64_t *, bool); + uint64_t (*vpeo_get_pmtp)(pfn_t); +}; + +extern vmm_pte_ops_t ept_pte_ops; +extern vmm_pte_ops_t rvi_pte_ops; + +struct vmm_gpt; +typedef struct vmm_gpt vmm_gpt_t; + +vmm_gpt_t *vmm_gpt_alloc(vmm_pte_ops_t *); +void vmm_gpt_free(vmm_gpt_t *); + +uint64_t *vmm_gpt_lookup(vmm_gpt_t *, uint64_t); +void vmm_gpt_walk(vmm_gpt_t *, uint64_t, uint64_t **, enum vmm_gpt_node_level); +void vmm_gpt_populate_region(vmm_gpt_t *, uint64_t, uint64_t); +bool vmm_gpt_map_at(vmm_gpt_t *, uint64_t *, pfn_t, uint_t, uint8_t); +void vmm_gpt_vacate_region(vmm_gpt_t *, uint64_t, uint64_t); +bool vmm_gpt_map(vmm_gpt_t *, uint64_t, pfn_t, uint_t, uint8_t); +bool vmm_gpt_unmap(vmm_gpt_t *, uint64_t); +size_t vmm_gpt_unmap_region(vmm_gpt_t *, uint64_t, uint64_t); +uint64_t vmm_gpt_get_pmtp(vmm_gpt_t *); + +bool vmm_gpt_is_mapped(vmm_gpt_t *, uint64_t *, pfn_t *, uint_t *); +uint_t vmm_gpt_reset_accessed(vmm_gpt_t *, uint64_t *, bool); +uint_t vmm_gpt_reset_dirty(vmm_gpt_t *, uint64_t *, bool); + +#endif /* _VMM_GPT_H */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_impl.h b/usr/src/uts/intel/io/vmm/sys/vmm_impl.h new file mode 100644 index 0000000000..2b6f41ec54 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_impl.h @@ -0,0 +1,97 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_IMPL_H_ +#define _VMM_IMPL_H_ + +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/varargs.h> +#include <sys/zone.h> +#include <sys/kstat.h> +#include <sys/vmm.h> + +#ifdef _KERNEL + +#define VMM_CTL_MINOR 0 + +/* + * Rather than creating whole character devices for devmem mappings, they are + * available by mmap(2)ing the vmm handle at a specific offset. These offsets + * begin just above the maximum allow guest physical address. + */ +#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1) + +struct vmm_devmem_entry { + list_node_t vde_node; + int vde_segid; + char vde_name[VM_MAX_SEG_NAMELEN]; + size_t vde_len; + off_t vde_off; +}; +typedef struct vmm_devmem_entry vmm_devmem_entry_t; + +typedef struct vmm_zsd vmm_zsd_t; + +enum vmm_softc_state { + VMM_HELD = 1, /* external driver(s) possess hold on the VM */ + VMM_CLEANUP = 2, /* request that holds are released */ + VMM_PURGED = 4, /* all hold have been released */ + VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */ + VMM_DESTROY = 16 /* VM is destroyed, softc still around */ +}; + +struct vmm_softc { + list_node_t vmm_node; + struct vm *vmm_vm; + minor_t vmm_minor; + char vmm_name[VM_MAX_NAMELEN]; + list_t vmm_devmem_list; + + kcondvar_t vmm_cv; + list_t vmm_holds; + uint_t vmm_flags; + boolean_t vmm_is_open; + + kmutex_t vmm_lease_lock; + list_t vmm_lease_list; + uint_t vmm_lease_blocker; + kcondvar_t vmm_lease_cv; + krwlock_t vmm_rwlock; + + /* For zone specific data */ + list_node_t vmm_zsd_linkage; + zone_t *vmm_zone; + vmm_zsd_t *vmm_zsd; + + kstat_t *vmm_kstat_vm; + kstat_t *vmm_kstat_vcpu[VM_MAXCPU]; +}; +typedef struct vmm_softc vmm_softc_t; + +void vmm_zsd_init(void); +void vmm_zsd_fini(void); +int vmm_zsd_add_vm(vmm_softc_t *sc); +void vmm_zsd_rem_vm(vmm_softc_t *sc); +int vmm_do_vm_destroy(vmm_softc_t *, boolean_t); + +#define VMM_MODULE_NAME "vmm" + +#endif /* _KERNEL */ + +#endif /* _VMM_IMPL_H_ */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_instruction_emul.h b/usr/src/uts/intel/io/vmm/sys/vmm_instruction_emul.h new file mode 100644 index 0000000000..4680c86a56 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_instruction_emul.h @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +#include <sys/mman.h> +#include <machine/vmm.h> + +struct vie; + +struct vie *vie_alloc(); +void vie_free(struct vie *); + +enum vm_reg_name vie_regnum_map(uint8_t); + +void vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, + const struct vm_guest_paging *paging, uint64_t gpa); +void vie_init_inout(struct vie *vie, const struct vm_inout *inout, + uint8_t inst_len, const struct vm_guest_paging *paging); +void vie_init_other(struct vie *vie, const struct vm_guest_paging *paging); + +int vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *res); +int vie_fulfill_inout(struct vie *vie, const struct vm_inout *res); + +bool vie_needs_fetch(const struct vie *vie); +bool vie_pending(const struct vie *vie); +uint64_t vie_mmio_gpa(const struct vie *vie); +void vie_exitinfo(const struct vie *vie, struct vm_exit *vme); +void vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme); +void vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, + uint64_t *cs_base, int *cs_d); + +void vie_reset(struct vie *vie); +void vie_advance_pc(struct vie *vie, uint64_t *nextrip); + +int vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid); +int vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid); +int vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid); + +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vie_fetch_instruction()' + */ +int vie_fetch_instruction(struct vie *vie, struct vm *vm, int cpuid, + uint64_t rip, int *is_fault); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, + struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, + int *is_fault); + +int vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla); +/* + * Decode the instruction fetched into 'vie' so it can be emulated. 
+ * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. + * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. + */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int csd); + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h new file mode 100644 index 0000000000..c84b33dc2e --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h @@ -0,0 +1,438 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
+ */ + +#ifndef _VMM_KERNEL_H_ +#define _VMM_KERNEL_H_ + +#include <sys/sdt.h> +#include <x86/segments.h> +#include <sys/vmm.h> + +SDT_PROVIDER_DECLARE(vmm); + +struct vm; +struct vm_exception; +struct seg_desc; +struct vm_exit; +struct vie; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vmspace; +struct vm_client; +struct vm_object; +struct vm_guest_paging; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + const struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vlapic *(*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; + + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, uint64_t flags, struct vm **retvm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm, uint64_t); +const char *vm_name(struct vm *vm); +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that race against hardware. + */ +void vm_track_dirty_pages(struct vm *, uint64_t, size_t, uint8_t *); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +int vm_assign_pptdev(struct vm *vm, int pptfd); +int vm_unassign_pptdev(struct vm *vm, int pptfd); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
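 *
 * A hedged usage sketch for the iteration API declared below (assumed
 * semantics: each call returns the mapping at or above *gpa, or an error
 * once the map is exhausted):
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t len;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(vm, &gpa, &segid, &segoff, &len,
 *	    &prot, &flags) == 0) {
 *		... inspect the mapping covering [gpa, gpa + len) ...
 *		gpa += len;
 *	}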
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + const struct seg_desc *desc); +int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, + uint8_t *sipi_vec); +int vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, + uint8_t sipi_vec); +int vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len); +int vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len); +int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +int vm_inject_init(struct vm *vm, int vcpuid); +int vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vec); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +struct vie *vm_vie_ctx(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip); +int vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, + int rsize); +int vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, + int wsize); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +bool vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip); +bool vcpu_run_state_pending(struct vm *vm, int vcpuid); +int vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only); + +/* + * Return true if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return false otherwise. 
+ */ +bool vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj); + +static __inline int +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + +typedef enum vcpu_notify { + VCPU_NOTIFY_NONE, + VCPU_NOTIFY_APIC, /* Posted intr notification (if possible) */ + VCPU_NOTIFY_EXIT, /* IPI to cause VM exit */ +} vcpu_notify_t; + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid); +void vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t); +struct vmspace *vm_get_vmspace(struct vm *vm); +struct vm_client *vm_get_vmclient(struct vm *vm, int vcpuid); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); + +/* + * Inject exception 'vector' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + int prot; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. 
+ * + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + uint_t num_copyinfo, int *is_fault); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + uint_t num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); + +/* APIs to inject faults into the guest */ +void vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +void vm_inject_ud(struct vm *vm, int vcpuid); +void vm_inject_gp(struct vm *vm, int vcpuid); +void vm_inject_ac(struct vm *vm, int vcpuid, int errcode); +void vm_inject_ss(struct vm *vm, int vcpuid, int errcode); +void vm_inject_pf(struct vm *vm, int vcpuid, int errcode, uint64_t cr2); + +/* + * Both SVM and VMX have complex logic for injecting events such as exceptions + * or interrupts into the guest. Within those two backends, the progress of + * event injection is tracked by event_inject_state, hopefully making it easier + * to reason about. + */ +enum event_inject_state { + EIS_CAN_INJECT = 0, /* exception/interrupt can be injected */ + EIS_EV_EXISTING = 1, /* blocked by existing event */ + EIS_EV_INJECTED = 2, /* blocked by injected event */ + EIS_GI_BLOCK = 3, /* blocked by guest interruptability */ + + /* + * Flag to request an immediate exit from VM context after event + * injection in order to perform more processing + */ + EIS_REQ_EXIT = (1 << 15), +}; + +/* Possible result codes for MSR access emulation */ +typedef enum vm_msr_result { + VMR_OK = 0, /* succesfully emulated */ + VMR_GP = 1, /* #GP should be injected */ + VMR_UNHANLDED = 2, /* handle in userspace, kernel cannot emulate */ +} vm_msr_result_t; + +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, this is not exposed directly via the vmm_drv + * interface, but rather mirrored as vmm_drv_iop_cb_t in vmm_drv.h. 
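 *
 * A hedged sketch of a handler matching the ioport_handler_t shape
 * declared below (the name and behavior are illustrative; compare
 * vrtc_addr_handler() in vrtc.c earlier in this series):
 *
 *	static int
 *	example_ioport_handler(void *arg, bool in, uint16_t port,
 *	    uint8_t bytes, uint32_t *val)
 *	{
 *		if (bytes != 1)
 *			return (-1);
 *		if (in)
 *			*val = 0xff;	reads return open-bus data
 *		return (0);		writes are ignored
 *	}
 *
 * Such a handler would be attached with vm_ioport_attach() or, from an
 * external driver, hooked in via vm_ioport_hook().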
+ */ +typedef int (*ioport_handler_t)(void *, bool, uint16_t, uint8_t, uint32_t *); + +int vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val); + +int vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, + void *arg, void **cookie); +int vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, + void **old_arg); + +int vm_ioport_hook(struct vm *, uint16_t, ioport_handler_t, void *, void **); +void vm_ioport_unhook(struct vm *, void **); + +enum vcpu_ustate { + VU_INIT = 0, /* initialized but has not yet attempted to run */ + VU_RUN, /* running in guest context */ + VU_IDLE, /* idle (HLTed, wait-for-SIPI, etc) */ + VU_EMU_KERN, /* emulation performed in-kernel */ + VU_EMU_USER, /* emulation performed in userspace */ + VU_SCHED, /* off-cpu for interrupt, preempt, lock contention */ + VU_MAX +}; + +void vcpu_ustate_change(struct vm *, int, enum vcpu_ustate); + +typedef struct vmm_kstats { + kstat_named_t vk_name; +} vmm_kstats_t; + +typedef struct vmm_vcpu_kstats { + kstat_named_t vvk_vcpu; + kstat_named_t vvk_time_init; + kstat_named_t vvk_time_run; + kstat_named_t vvk_time_idle; + kstat_named_t vvk_time_emu_kern; + kstat_named_t vvk_time_emu_user; + kstat_named_t vvk_time_sched; +} vmm_vcpu_kstats_t; + +#define VMM_KSTAT_CLASS "misc" + +int vmm_kstat_update_vcpu(struct kstat *, int); + +#endif /* _VMM_KERNEL_H_ */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_reservoir.h b/usr/src/uts/intel/io/vmm/sys/vmm_reservoir.h new file mode 100644 index 0000000000..b8215ce654 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_reservoir.h @@ -0,0 +1,40 @@ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _SYS_VMM_RESERVOIR_H +#define _SYS_VMM_RESERVOIR_H + +#include <sys/types.h> +#include <sys/cred.h> + +struct vmmr_region; +typedef struct vmmr_region vmmr_region_t; + +void vmmr_init(); +void vmmr_fini(); +bool vmmr_is_empty(); + +int vmmr_alloc(size_t, bool, vmmr_region_t **); +void *vmmr_region_mem_at(vmmr_region_t *, uintptr_t); +pfn_t vmmr_region_pfn_at(vmmr_region_t *, uintptr_t); +void vmmr_free(vmmr_region_t *); + +int vmmr_add(size_t, bool); +int vmmr_remove(size_t, bool); + +int vmmr_ioctl(int, intptr_t, int, cred_t *, int *); + +#endif /* _SYS_VMM_RESERVOIR_H */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_vm.h b/usr/src/uts/intel/io/vmm/sys/vmm_vm.h new file mode 100644 index 0000000000..57d0ec8b00 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/sys/vmm_vm.h @@ -0,0 +1,89 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_VM_H +#define _VMM_VM_H + +#include <sys/types.h> + +typedef struct vmspace vmspace_t; +typedef struct vm_client vm_client_t; +typedef struct vm_page vm_page_t; +typedef struct vm_object vm_object_t; + +struct vmm_pte_ops; + +typedef void (*vmc_inval_cb_t)(void *, uintptr_t, size_t); + +/* vmspace_t operations */ +vmspace_t *vmspace_alloc(size_t, struct vmm_pte_ops *, bool); +void vmspace_destroy(vmspace_t *); +int vmspace_map(vmspace_t *, vm_object_t *, uintptr_t, uintptr_t, size_t, + uint8_t); +int vmspace_unmap(vmspace_t *, uintptr_t, uintptr_t); +int vmspace_populate(vmspace_t *, uintptr_t, uintptr_t); +vm_client_t *vmspace_client_alloc(vmspace_t *); +uint64_t vmspace_table_root(vmspace_t *); +uint64_t vmspace_table_gen(vmspace_t *); +uint64_t vmspace_resident_count(vmspace_t *); +void vmspace_track_dirty(vmspace_t *, uint64_t, size_t, uint8_t *); + +/* vm_client_t operations */ +vm_page_t *vmc_hold(vm_client_t *, uintptr_t, int); +uint64_t vmc_table_enter(vm_client_t *); +void vmc_table_exit(vm_client_t *); +int vmc_fault(vm_client_t *, uintptr_t, int); +vm_client_t *vmc_clone(vm_client_t *); +int vmc_set_inval_cb(vm_client_t *, vmc_inval_cb_t, void *); +void vmc_destroy(vm_client_t *); + +/* vm_object_t operations */ +vm_object_t *vm_object_mem_allocate(size_t, bool); +vm_object_t *vmm_mmio_alloc(vmspace_t *, uintptr_t, size_t, uintptr_t); +void vm_object_reference(vm_object_t *); +void vm_object_release(vm_object_t *); +pfn_t vm_object_pfn(vm_object_t *, uintptr_t); + +/* vm_page_t operations */ +const void *vmp_get_readable(const vm_page_t *); +void *vmp_get_writable(const vm_page_t *); +pfn_t vmp_get_pfn(const vm_page_t *); +void vmp_chain(vm_page_t *, vm_page_t *); +vm_page_t *vmp_next(const vm_page_t *); +bool vmp_release(vm_page_t *); +bool vmp_release_chain(vm_page_t *); + +/* seg_vmm mapping */ +struct vm; +int vm_segmap_obj(struct vm *, int, off_t, off_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vm *, off_t, struct as *, caddr_t *, off_t, uint_t, + uint_t, uint_t); + +/* Glue functions */ +vm_paddr_t vtophys(void *); +void invalidate_cache_all(void); + +/* + * The VM_MAXUSER_ADDRESS determines the upper size limit of a vmspace. + * This value is sized well below the host userlimit, halving the + * available space below the VA hole to avoid Intel EPT limits and + * leave room available in the usable VA range for other mmap tricks. + */ +#define VM_MAXUSER_ADDRESS 0x00003ffffffffffful + +#endif /* _VMM_VM_H */ diff --git a/usr/src/uts/intel/io/vmm/vmm.c b/usr/src/uts/intel/io/vmm/vmm.c new file mode 100644 index 0000000000..be181781de --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm.c @@ -0,0 +1,3676 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/systm.h> +#include <sys/sunddi.h> +#include <sys/hma.h> + +#include <machine/md_var.h> +#include <x86/psl.h> +#include <x86/apicreg.h> + +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmparam.h> +#include <sys/vmm_instruction_emul.h> +#include <sys/vmm_vm.h> +#include <sys/vmm_gpt.h> + +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_host.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +/* Flags for vtc_status */ +#define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ +#define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ + +typedef struct vm_thread_ctx { + struct vm *vtc_vm; + int vtc_vcpuid; + uint_t vtc_status; + enum vcpu_ustate vtc_ustate; +} vm_thread_ctx_t; + +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ +struct vcpu { + /* (o) protects state, run_state, hostcpu, sipi_vector */ + kmutex_t lock; + + enum vcpu_state state; /* (o) vcpu state */ + enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ + kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ + kcondvar_t state_cv; /* (o) IDLE-transition cv */ + int hostcpu; /* (o) vcpu's current host cpu */ + int lastloccpu; /* (o) last host cpu localized to */ + int reqidle; /* (i) request vcpu to idle */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + 
uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + uint8_t sipi_vector; /* (i) SIPI vector */ + hma_fpu_t *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ + uint64_t nextrip; /* (x) next instruction to execute */ + struct vie *vie_ctx; /* (x) instruction emulation context */ + vm_client_t *vmclient; /* (a) VM-system client */ + uint64_t tsc_offset; /* (x) offset from host TSC */ + + enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ + hrtime_t ustate_when; /* (i) time of last ustate change */ + uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */ + vm_thread_ctx_t vtc; /* (o) thread state for ctxops */ + struct ctxop *ctxop; /* (o) ctxop storage for vcpu */ +}; + +#define vcpu_lock(v) mutex_enter(&((v)->lock)) +#define vcpu_unlock(v) mutex_exit(&((v)->lock)) +#define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock))) + +struct mem_seg { + size_t len; + bool sysmem; + vm_object_t *object; +}; +#define VM_MAX_MEMSEGS 4 + +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 8 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ + uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ + + struct ioport_config ioports; /* (o) ioport handling */ + + bool mem_transient; /* (o) alloc transient memory */ +}; + +static int vmm_initialized; + + +static void +nullop_panic(void) +{ + panic("null vmm operation call"); +} + +/* Do not allow use of an un-set `ops` to do anything but panic */ +static struct vmm_ops vmm_ops_null = { + .init = (vmm_init_func_t)nullop_panic, + .cleanup = (vmm_cleanup_func_t)nullop_panic, + .resume = (vmm_resume_func_t)nullop_panic, + .vminit = (vmi_init_func_t)nullop_panic, + .vmrun = (vmi_run_func_t)nullop_panic, + 
.vmcleanup = (vmi_cleanup_func_t)nullop_panic, + .vmgetreg = (vmi_get_register_t)nullop_panic, + .vmsetreg = (vmi_set_register_t)nullop_panic, + .vmgetdesc = (vmi_get_desc_t)nullop_panic, + .vmsetdesc = (vmi_set_desc_t)nullop_panic, + .vmgetcap = (vmi_get_cap_t)nullop_panic, + .vmsetcap = (vmi_set_cap_t)nullop_panic, + .vlapic_init = (vmi_vlapic_init)nullop_panic, + .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, + .vmsavectx = (vmi_savectx)nullop_panic, + .vmrestorectx = (vmi_restorectx)nullop_panic, +}; + +static struct vmm_ops *ops = &vmm_ops_null; +static vmm_pte_ops_t *pte_ops = NULL; + +#define VMM_INIT() ((*ops->init)()) +#define VMM_CLEANUP() ((*ops->cleanup)()) +#define VMM_RESUME() ((*ops->resume)()) + +#define VMINIT(vm) ((*ops->vminit)(vm)) +#define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) +#define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) + +#define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) +#define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) +#define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc)) +#define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc)) +#define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv)) +#define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val)) +#define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu)) +#define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic)) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +SDT_PROVIDER_DEFINE(vmm); + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. 
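+ *
+ * Detection is gated by the 'halt_detection_enabled' tunable below: when it
+ * is clear, sleeping vcpus are never recorded in 'halted_cpus', so a guest
+ * with every vcpu idle in HLT will not be suspended with VM_SUSPEND_HALT.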
+ */ +static int halt_detection_enabled = 1; + +/* Trap into hypervisor on all guest exceptions and reflect them back */ +static int trace_guest_exceptions; + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); +static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); +static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); + +static void vmm_savectx(void *); +static void vmm_restorectx(void *); +static const struct ctxop_template vmm_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = vmm_savectx, + .ct_restore = vmm_restorectx, +}; + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); + if (destroy) { + vmm_stat_free(vcpu->stats); + + hma_fpu_free(vcpu->guestfpu); + vcpu->guestfpu = NULL; + + vie_free(vcpu->vie_ctx); + vcpu->vie_ctx = NULL; + + vmc_destroy(vcpu->vmclient); + vcpu->vmclient = NULL; + + ctxop_free(vcpu->ctxop); + mutex_destroy(&vcpu->lock); + } +} + +static void +vcpu_init(struct vm *vm, int vcpu_id, bool create) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { + mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL); + + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; + vcpu->lastloccpu = NOCPU; + vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); + vcpu->stats = vmm_stat_alloc(); + vcpu->vie_ctx = vie_alloc(); + + vcpu->ustate = VU_INIT; + vcpu->ustate_when = gethrtime(); + + vcpu->vtc.vtc_vm = vm; + vcpu->vtc.vtc_vcpuid = vcpu_id; + vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc); + } else { + vie_reset(vcpu->vie_ctx); + bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); + if (vcpu->ustate != VU_INIT) { + vcpu_ustate_change(vm, vcpu_id, VU_INIT); + } + } + + vcpu->run_state = VRS_HALT; + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->reqidle = 0; + vcpu->exitintinfo = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + hma_fpu_init(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); + vcpu->tsc_offset = 0; +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= vm->maxcpus) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +struct vie * +vm_vie_ctx(struct vm *vm, int cpuid) +{ + if (cpuid < 0 || cpuid >= vm->maxcpus) + panic("vm_vie_ctx: invalid cpuid %d", cpuid); + + return (vm->vcpu[cpuid].vie_ctx); +} + +static int +vmm_init(void) +{ + vmm_host_state_init(); + + if (vmm_is_intel()) { + ops = &vmm_ops_intel; + pte_ops = &ept_pte_ops; + } else if (vmm_is_svm()) { + ops = &vmm_ops_amd; + pte_ops = &rvi_pte_ops; + } else { + return (ENXIO); + } + + return (VMM_INIT()); +} + +int +vmm_mod_load() +{ + int error; + + VERIFY(vmm_initialized == 
0); + + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + + return (error); +} + +int +vmm_mod_unload() +{ + int error; + + VERIFY(vmm_initialized == 1); + + iommu_cleanup(); + error = VMM_CLEANUP(); + if (error) + return (error); + vmm_initialized = 0; + + return (0); +} + +static void +vm_init(struct vm *vm, bool create) +{ + int i; + + vm->cookie = VMINIT(vm); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); + + vm_inout_init(vm, &vm->ioports); + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); + + /* + * Configure the VM-wide TSC offset so that the call to vm_init() + * represents the boot time (when the TSC(s) read 0). Each vCPU will + * have its own offset from this, which is altered if/when the guest + * writes to MSR_TSC. + * + * The TSC offsetting math is all unsigned, using overflow for negative + * offets. A reading of the TSC is negated to form the boot offset. + */ + vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset()); +} + +/* + * The default CPU topology is a single thread per package. + */ +uint_t cores_per_package = 1; +uint_t threads_per_core = 1; + +/* + * Debugging tunable to enable dirty-page-tracking. + * (Remains off by default for now) + */ +bool gpt_track_dirty = false; + +int +vm_create(const char *name, uint64_t flags, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + /* Name validation has already occurred */ + VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN); + + vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + + vm->vmspace = vmspace; + vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; + for (uint_t i = 0; i < VM_MAXCPU; i++) { + vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); + } + + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? 
*/ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return (0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + struct mem_map *mm; + int i; + + ppt_unassign_all(vm); + + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + + /* + * Devices which attach their own ioport hooks should be cleaned up + * first so they can tear down those registrations. + */ + vpmtmr_cleanup(vm->vpmtmr); + + vm_inout_cleanup(vm, &vm->ioports); + + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); + + VMCLEANUP(vm->cookie); + + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) { + vm_free_memmap(vm, i); + } else { + /* + * We need to reset the IOMMU flag so this mapping can + * be reused when a VM is rebooted. Since the IOMMU + * domain has already been destroyed we can just reset + * the flag here. + */ + mm->flags &= ~VM_MEMMAP_F_IOMMU; + } + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + vmspace_destroy(vm->vmspace); + vm->vmspace = NULL; + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VM); +} + +int +vm_reinit(struct vm *vm, uint64_t flags) +{ + /* A virtual machine can be reset only if all vcpus are suspended. */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) { + if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) { + return (EBUSY); + } + + /* + * Force the VM (and all its vCPUs) into a suspended state. + * This should be quick and easy, since the vm_reinit() call is + * made while holding the VM write lock, which requires holding + * all of the vCPUs in the VCPU_FROZEN state. + */ + (void) atomic_cmpset_int((uint_t *)&vm->suspend, 0, + VM_SUSPEND_RESET); + for (uint_t i = 0; i < vm->maxcpus; i++) { + struct vcpu *vcpu = &vm->vcpu[i]; + + if (CPU_ISSET(i, &vm->suspended_cpus) || + !CPU_ISSET(i, &vm->active_cpus)) { + continue; + } + + vcpu_lock(vcpu); + VERIFY3U(vcpu->state, ==, VCPU_FROZEN); + CPU_SET_ATOMIC(i, &vm->suspended_cpus); + vcpu_unlock(vcpu); + } + + VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus)); + } + + vm_cleanup(vm, false); + vm_init(vm, false); + return (0); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + vm_object_t *obj; + + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); +} + +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. 
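+ * (Under INVARIANTS this is asserted below: the calling vcpu must be in the
+ * VCPU_RUNNING state on the current host CPU.)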
+ */ +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) +{ + struct mem_map *mm; + int i; + +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ + } + + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ + + return (false); +} + +int +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +{ + struct mem_seg *seg; + vm_object_t *obj; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + if (len == 0 || (len & PAGE_MASK)) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } + + obj = vm_object_mem_allocate(len, vm->mem_transient); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t **objptr) +{ + struct mem_seg *seg; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_release(seg->object); + bzero(seg, sizeof (struct mem_seg)); + } +} + +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); + if (error != 0) + return (EFAULT); + + vm_object_reference(seg->object); + + if ((flags & VM_MEMMAP_F_WIRED) != 0) { + error = vmspace_populate(vm->vmspace, gpa, gpa + len); + if (error != 0) { + vmspace_unmap(vm->vmspace, gpa, gpa + len); + return (EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + struct mem_map *m; + int i; + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->gpa == gpa && m->len == len && + (m->flags & VM_MEMMAP_F_IOMMU) == 0) { + vm_free_memmap(vm, i); + return (0); + } + } + + return (EINVAL); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t 
*segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vmspace_unmap(vm->vmspace, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == 0, ("%s: vmspace_unmap error %d", + __func__, error)); + bzero(mm, sizeof (struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static void +vm_iommu_modify(struct vm *vm, bool map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_map *mm; +#ifdef __FreeBSD__ + void *vp, *cookie, *host_domain; +#endif + vm_client_t *vmc; + + sz = PAGE_SIZE; +#ifdef __FreeBSD__ + host_domain = iommu_host_domain(); +#endif + vmc = vmspace_client_alloc(vm->vmspace); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; + + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %lx/%lx/%x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %lx/%lx/%x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vm_page_t *vmp; + + vmp = vmc_hold(vmc, gpa, PROT_WRITE); + ASSERT(vmp != NULL); + hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); + vmp_release(vmp); + + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); +#ifdef __FreeBSD__ + iommu_remove_mapping(host_domain, hpa, sz); +#endif + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); +#ifdef __FreeBSD__ + iommu_create_mapping(host_domain, hpa, hpa, sz); +#endif + } + + gpa += PAGE_SIZE; + } + } + vmc_destroy(vmc); + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. 
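+ *
+ * The host_domain bookkeeping is FreeBSD-only; on illumos only the guest
+ * domain ('vm->iommu') is invalidated, regardless of direction.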
+ */ +#ifdef __FreeBSD__ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +#else + iommu_invalidate_tlb(vm->iommu); +#endif +} + +int +vm_unassign_pptdev(struct vm *vm, int pptfd) +{ + int error; + + error = ppt_unassign_device(vm, pptfd); + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) + vm_iommu_modify(vm, false); + + return (0); +} + +int +vm_assign_pptdev(struct vm *vm, int pptfd) +{ + int error; + vm_paddr_t maxaddr; + + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + vm_iommu_modify(vm, true); + } + + error = ppt_assign_device(vm, pptfd); + return (error); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static bool +is_descriptor_table(int reg) +{ + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (true); + default: + return (false); + } +} + +static bool +is_segment_register(int reg) +{ + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (true); + default: + return (false); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +static int +translate_hma_xsave_result(hma_fpu_xsave_result_t res) +{ + switch (res) { + case HFXR_OK: + return (0); + case HFXR_NO_SPACE: + return (ENOSPC); + case HFXR_BAD_ALIGN: + case HFXR_UNSUP_FMT: + case HFXR_UNSUP_FEAT: + case HFXR_INVALID_DATA: + return (EINVAL); + default: + panic("unexpected xsave result"); + } +} + +int +vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + hma_fpu_xsave_result_t res; + + res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); + return (translate_hma_xsave_result(res)); +} + +int +vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + hma_fpu_xsave_result_t res; + + res = 
hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); + return (translate_hma_xsave_result(res)); +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + *state = vcpu->run_state; + *sipi_vec = vcpu->sipi_vector; + vcpu_unlock(vcpu); + + return (0); +} + +int +vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + if (!VRS_IS_VALID(state)) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->run_state = state; + vcpu->sipi_vector = sipi_vec; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); + + return (0); +} + +void +vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) +{ + vmspace_t *vms = vm_get_vmspace(vm); + vmspace_track_dirty(vms, gpa, len, bitmap); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + /* Save host FPU and restore guest FPU */ + fpu_stop_emulating(); + hma_fpu_start_guest(vcpu->guestfpu); + + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + fpu_start_emulating(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + + /* save guest FPU and restore host FPU */ + fpu_stop_emulating(); + hma_fpu_stop_guest(vcpu->guestfpu); + /* + * When the host state has been restored, we should not re-enable + * CR0.TS on illumos for eager FPU. + */ +} + +static int +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + struct vcpu *vcpu; + int error; + + vcpu = &vm->vcpu[vcpuid]; + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. 
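+ *
+ * For example, if two ioctls race to operate on the same vcpu, the second
+ * caller sets 'reqidle', notifies the vcpu, and blocks on 'state_cv' until
+ * the first caller returns the vcpu to VCPU_IDLE and the cv_broadcast()
+ * below wakes it.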
+ */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); + cv_wait(&vcpu->state_cv, &vcpu->lock); + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) { + cv_broadcast(&vcpu->state_cv); + } + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) +{ + struct vcpu *vcpu; + int vcpu_halted, vm_halted; + bool userspace_exit = false; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_halted = 0; + vm_halted = 0; + + vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending interrupts (including NMI and + * INIT) before putting this thread to sleep. + */ + if (vm_nmi_pending(vm, vcpuid)) + break; + if (vcpu_run_state_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } + + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). + */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + userspace_exit = true; + break; + } + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. 
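+ *
+ * Once 'halted_cpus' matches 'active_cpus', the loop below is exited and
+ * the VM is suspended with VM_SUSPEND_HALT.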
+ */ + if (intr_disabled) { + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } + } + + vcpu_ustate_change(vm, vcpuid, VU_IDLE); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); + } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + + vcpu_unlock(vcpu); + + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (userspace_exit ? -1 : 0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + vm_client_t *vmc = vcpu->vmclient; + struct vm_exit *vme = &vcpu->exitinfo; + int rv, ftype; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == PROT_READ || + ftype == PROT_WRITE || ftype == PROT_EXEC, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != 0) + return (EFAULT); + return (0); +} + +int +vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, + int rsize) +{ + int err = ESRCH; + + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + struct vlapic *vlapic = vm_lapic(vm, cpuid); + + err = vlapic_mmio_read(vlapic, gpa, rval, rsize); + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); + } + + return (err); +} + +int +vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, + int wsize) +{ + int err = ESRCH; + + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + struct vlapic *vlapic = vm_lapic(vm, cpuid); + + err = vlapic_mmio_write(vlapic, gpa, wval, wsize); + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); + } + + return (err); +} + +static int +vm_handle_mmio_emul(struct vm *vm, int vcpuid) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t inst_addr; + int error, fault, cs_d; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + vie = vcpu->vie_ctx; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + inst_addr = vme->rip + vme->u.mmio_emul.cs_base; + cs_d = vme->u.mmio_emul.cs_d; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx", + vme->u.mmio_emul.gpa); + + /* Fetch the faulting instruction */ + if (vie_needs_fetch(vie)) { + error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, + &fault); + if (error != 0) { + return (error); + } else if (fault) { + /* + * If a fault during instruction fetch was encountered, + * it will have asserted that the appropriate exception + * be injected at next entry. + * No further work is required. 
+ */ + return (0); + } + } + + if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx", + inst_addr); + /* Dump (unrecognized) instruction bytes in userspace */ + vie_fallback_exitinfo(vie, vme); + return (-1); + } + if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && + vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { + /* Decoded GLA does not match GLA from VM exit state */ + vie_fallback_exitinfo(vie, vme); + return (-1); + } + +repeat: + error = vie_emulate_mmio(vie, vm, vcpuid); + if (error < 0) { + /* + * MMIO not handled by any of the in-kernel-emulated devices, so + * make a trip out to userspace for it. + */ + vie_exitinfo(vie, vme); + } else if (error == EAGAIN) { + /* + * Continue emulating the rep-prefixed instruction, which has + * not completed its iterations. + * + * In case this can be emulated in-kernel and has a high + * repetition count (causing a tight spin), it should be + * deferential to yield conditions. + */ + if (!vcpu_should_yield(vm, vcpuid)) { + goto repeat; + } else { + /* + * Defer to the contending load by making a trip to + * userspace with a no-op (BOGUS) exit reason. + */ + vie_reset(vie); + vme->exitcode = VM_EXITCODE_BOGUS; + return (-1); + } + } else if (error == 0) { + /* Update %rip now that instruction has been emulated */ + vie_advance_pc(vie, &vcpu->nextrip); + } + return (error); +} + +static int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *vcpu; + struct vie *vie; + int err; + + vcpu = &vm->vcpu[vcpuid]; + vie = vcpu->vie_ctx; + +repeat: + err = vie_emulate_inout(vie, vm, vcpuid); + + if (err < 0) { + /* + * In/out not handled by any of the in-kernel-emulated devices, + * so make a trip out to userspace for it. + */ + vie_exitinfo(vie, vme); + return (err); + } else if (err == EAGAIN) { + /* + * Continue emulating the rep-prefixed ins/outs, which has not + * completed its iterations. + * + * In case this can be emulated in-kernel and has a high + * repetition count (causing a tight spin), it should be + * deferential to yield conditions. + */ + if (!vcpu_should_yield(vm, vcpuid)) { + goto repeat; + } else { + /* + * Defer to the contending load by making a trip to + * userspace with a no-op (BOGUS) exit reason. + */ + vie_reset(vie); + vme->exitcode = VM_EXITCODE_BOGUS; + return (-1); + } + } else if (err != 0) { + /* Emulation failure. Bail all the way out to userspace. */ + vme->exitcode = VM_EXITCODE_INST_EMUL; + bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); + return (-1); + } + + vie_advance_pc(vie, &vcpu->nextrip); + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t cs_base; + int error, fault, cs_d; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + vie = vcpu->vie_ctx; + + vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d); + + /* Fetch the faulting instruction */ + ASSERT(vie_needs_fetch(vie)); + error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base, + &fault); + if (error != 0) { + return (error); + } else if (fault) { + /* + * If a fault during instruction fetch was encounted, it will + * have asserted that the appropriate exception be injected at + * next entry. No further work is required. 
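+ * (This mirrors the fetch handling in vm_handle_mmio_emul() above.)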
+ */ + return (0); + } + + if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { + /* Dump (unrecognized) instruction bytes in userspace */ + vie_fallback_exitinfo(vie, vme); + return (-1); + } + + error = vie_emulate_other(vie, vm, vcpuid); + if (error != 0) { + /* + * Instruction emulation was unable to complete successfully, so + * kick it out to userspace for handling. + */ + vie_fallback_exitinfo(vie, vme); + } else { + /* Update %rip now that instruction has been emulated */ + vie_advance_pc(vie, &vcpu->nextrip); + } + return (error); +} + +static int +vm_handle_suspend(struct vm *vm, int vcpuid) +{ + int i; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + */ + vcpu_lock(vcpu); + vcpu_ustate_change(vm, vcpuid, VU_INIT); + while (1) { + int rc; + + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz, + TR_CLOCK_TICK); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + + /* + * If the userspace process driving the instance is killed, any + * vCPUs yet to be marked suspended (because they are not + * VM_RUN-ing in the kernel presently) will never reach that + * state. + * + * To avoid vm_handle_suspend() getting stuck in the kernel + * waiting for those vCPUs, offer a bail-out even though it + * means returning without all vCPUs in a suspended state. + */ + if (rc <= 0) { + if ((curproc->p_flag & SEXITING) != 0) { + break; + } + } + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i); + } + } + + return (-1); +} + +static int +vm_handle_reqidle(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + return (-1); +} + +static int +vm_handle_run_state(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + bool handled = false; + + vcpu_lock(vcpu); + while (1) { + if ((vcpu->run_state & VRS_PEND_INIT) != 0) { + vcpu_unlock(vcpu); + VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); + vcpu->run_state |= VRS_INIT; + } + + if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == + (VRS_INIT | VRS_PEND_SIPI)) { + const uint8_t vector = vcpu->sipi_vector; + + vcpu_unlock(vcpu); + VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~VRS_PEND_SIPI; + vcpu->run_state |= VRS_RUN; + } + + /* + * If the vCPU is now in the running state, there is no need to + * wait for anything prior to re-entry. + */ + if ((vcpu->run_state & VRS_RUN) != 0) { + handled = true; + break; + } + + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). 
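+ * (vcpu_sleep_bailout_checks() is the same set of checks used by the HLT
+ * handling in vm_handle_hlt() above.)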
+ */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + break; + } + + vcpu_ustate_change(vm, vcpuid, VU_IDLE); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); + } + vcpu_unlock(vcpu); + + return (handled ? 0 : -1); +} + +static int +vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + const uint32_t code = vme->u.msr.code; + uint64_t val = 0; + + switch (code) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + val = 0; + break; + + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + val = 0; + break; + + case MSR_TSC: + /* + * In all likelihood, this should always be handled in guest + * context by VMX/SVM rather than taking an exit. (Both VMX and + * SVM pass through read-only access to MSR_TSC to the guest.) + * + * No physical offset is requested of vcpu_tsc_offset() since + * rdtsc_offset() takes care of that instead. + */ + val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); + break; + + default: + /* + * Anything not handled at this point will be kicked out to + * userspace for attempted processing there. + */ + return (-1); + } + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + val & 0xffffffff)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, + val >> 32)); + return (0); +} + +static int +vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + const uint32_t code = vme->u.msr.code; + const uint64_t val = vme->u.msr.wval; + + switch (code) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + /* Ignore writes */ + break; + + case MSR_MTRRcap: + vm_inject_gp(vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + /* Ignore writes */ + break; + + case MSR_TSC: + /* + * The effect of writing the TSC MSR is that a subsequent read + * of the TSC would report that value written (plus any time + * elapsed between the write and the read). The guest TSC value + * is calculated from a global offset for the guest (which + * effectively makes its TSC read 0 at guest boot) and a + * per-vCPU offset to handle these writes to the MSR. + * + * To calculate that per-vCPU offset, we can work backwards from + * the guest value at the time of write: + * + * value = host TSC + VM boot offset + vCPU offset + * + * so therefore: + * + * value - host TSC - VM boot offset = vCPU offset + */ + vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); + break; + + default: + /* + * Anything not handled at this point will be kicked out to + * userspace for attempted processing there. + */ + return (-1); + } + + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { + return (EALREADY); + } + + /* + * Notify all active vcpus that they are now suspended. + */ + for (uint_t i = 0; i < vm->maxcpus; i++) { + struct vcpu *vcpu = &vm->vcpu[i]; + + vcpu_lock(vcpu); + if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { + /* + * Any vCPUs not actively running or in HLT can be + * marked as suspended immediately. 
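+ * (These are the vcpus observed in the VCPU_IDLE or VCPU_FROZEN state by
+ * the check above.)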
+ */ + if (CPU_ISSET(i, &vm->active_cpus)) { + CPU_SET_ATOMIC(i, &vm->suspended_cpus); + } + } else { + /* + * Those which are running or in HLT will pick up the + * suspended state after notification. + */ + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + } + vcpu_unlock(vcpu); + } + return (0); +} + +void +vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RUN_STATE; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); +} + +/* + * Some vmm resources, such as the lapic, may have CPU-specific resources + * allocated to them which would benefit from migration onto the host CPU which + * is processing the vcpu state. + */ +static void +vm_localize_resources(struct vm *vm, struct vcpu *vcpu) +{ + /* + * Localizing cyclic resources requires acquisition of cpu_lock, and + * doing so with kpreempt disabled is a recipe for deadlock disaster. + */ + VERIFY(curthread->t_preempt == 0); + + /* + * Do not bother with localization if this vCPU is about to return to + * the host CPU it was last localized to. + */ + if (vcpu->lastloccpu == curcpu) + return; + + /* + * Localize system-wide resources to the primary boot vCPU. While any + * of the other vCPUs may access them, it keeps the potential interrupt + * footprint constrained to CPUs involved with this instance. + */ + if (vcpu == &vm->vcpu[0]) { + vhpet_localize_resources(vm->vhpet); + vrtc_localize_resources(vm->vrtc); + vatpit_localize_resources(vm->vatpit); + } + + vlapic_localize_resources(vcpu->vlapic); + + vcpu->lastloccpu = curcpu; +} + +static void +vmm_savectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + if (ops->vmsavectx != NULL) { + ops->vmsavectx(vm->cookie, vcpuid); + } + + /* + * Account for going off-cpu, unless the vCPU is idled, where being + * off-cpu is the explicit point. + */ + if (vm->vcpu[vcpuid].ustate != VU_IDLE) { + vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; + vcpu_ustate_change(vm, vcpuid, VU_SCHED); + } + + /* + * If the CPU holds the restored guest FPU state, save it and restore + * the host FPU state before this thread goes off-cpu. + */ + if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + save_guest_fpustate(vcpu); + vtc->vtc_status &= ~VTCS_FPU_RESTORED; + } +} + +static void +vmm_restorectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + /* Complete microstate accounting for vCPU being off-cpu */ + if (vm->vcpu[vcpuid].ustate != VU_IDLE) { + vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); + } + + /* + * When coming back on-cpu, only restore the guest FPU status if the + * thread is in a context marked as requiring it. This should be rare, + * occurring only when a future logic error results in a voluntary + * sleep during the VMRUN critical section. + * + * The common case will result in elision of the guest FPU state + * restoration, deferring that action until it is clearly necessary + * during vm_run. 
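+ *
+ * (vm_run() performs that restoration itself, after critical_enter() and
+ * immediately before entering the guest via VMRUN().)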
+ */ + VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); + if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + restore_guest_fpustate(vcpu); + vtc->vtc_status |= VTCS_FPU_RESTORED; + } + + if (ops->vmrestorectx != NULL) { + ops->vmrestorectx(vm->cookie, vcpuid); + } + +} + +static int +vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, + struct vm_exit *vme) +{ + struct vcpu *vcpu; + struct vie *vie; + int err; + + vcpu = &vm->vcpu[vcpuid]; + vie = vcpu->vie_ctx; + err = 0; + + switch (entry->cmd) { + case VEC_DEFAULT: + return (0); + case VEC_DISCARD_INSTR: + vie_reset(vie); + return (0); + case VEC_FULFILL_MMIO: + err = vie_fulfill_mmio(vie, &entry->u.mmio); + if (err == 0) { + err = vie_emulate_mmio(vie, vm, vcpuid); + if (err == 0) { + vie_advance_pc(vie, &vcpu->nextrip); + } else if (err < 0) { + vie_exitinfo(vie, vme); + } else if (err == EAGAIN) { + /* + * Clear the instruction emulation state in + * order to re-enter VM context and continue + * this 'rep <instruction>' + */ + vie_reset(vie); + err = 0; + } + } + break; + case VEC_FULFILL_INOUT: + err = vie_fulfill_inout(vie, &entry->u.inout); + if (err == 0) { + err = vie_emulate_inout(vie, vm, vcpuid); + if (err == 0) { + vie_advance_pc(vie, &vcpu->nextrip); + } else if (err < 0) { + vie_exitinfo(vie, vme); + } else if (err == EAGAIN) { + /* + * Clear the instruction emulation state in + * order to re-enter VM context and continue + * this 'rep ins/outs' + */ + vie_reset(vie); + err = 0; + } + } + break; + default: + return (EINVAL); + } + return (err); +} + +static int +vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vie *vie; + + vie = vm->vcpu[vcpuid].vie_ctx; + + if (vie_pending(vie)) { + /* + * Userspace has not fulfilled the pending needs of the + * instruction emulation, so bail back out. + */ + vie_exitinfo(vie, vme); + return (-1); + } + + return (0); +} + +int +vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) +{ + int error; + struct vcpu *vcpu; + struct vm_exit *vme; + bool intr_disabled; + int affinity_type = CPU_CURRENT; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); + + vcpu->vtc.vtc_status = 0; + ctxop_attach(curthread, vcpu->ctxop); + + error = vm_entry_actions(vm, vcpuid, entry, vme); + if (error != 0) { + goto exit; + } + +restart: + error = vm_loop_checks(vm, vcpuid, vme); + if (error != 0) { + goto exit; + } + + thread_affinity_set(curthread, affinity_type); + /* + * Resource localization should happen after the CPU affinity for the + * thread has been set to ensure that access from restricted contexts, + * such as VMX-accelerated APIC operations, can occur without inducing + * cyclic cross-calls. + * + * This must be done prior to disabling kpreempt via critical_enter(). 
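+ * (vm_localize_resources() VERIFYs that preemption has not yet been
+ * disabled before it touches cyclic-backed resources.)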
+ */ + vm_localize_resources(vm, vcpu); + affinity_type = CPU_CURRENT; + critical_enter(); + + /* Force a trip through update_sregs to reload %fs/%gs and friends */ + PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); + + if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { + restore_guest_fpustate(vcpu); + vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; + } + vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + + /* + * Once clear of the delicate contexts comprising the VM_RUN handler, + * thread CPU affinity can be loosened while other processing occurs. + */ + vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; + thread_affinity_clear(curthread); + critical_exit(); + + if (error != 0) { + /* Communicate out any error from VMRUN() above */ + goto exit; + } + + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vm, vcpuid); + break; + case VM_EXITCODE_RUN_STATE: + error = vm_handle_run_state(vm, vcpuid); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled); + break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid); + break; + case VM_EXITCODE_MMIO_EMUL: + error = vm_handle_mmio_emul(vm, vcpuid); + break; + case VM_EXITCODE_INOUT: + error = vm_handle_inout(vm, vcpuid, vme); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid); + break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vm, vcpuid); + break; + case VM_EXITCODE_RDMSR: + error = vm_handle_rdmsr(vm, vcpuid, vme); + break; + case VM_EXITCODE_WRMSR: + error = vm_handle_wrmsr(vm, vcpuid, vme); + break; + case VM_EXITCODE_HT: + affinity_type = CPU_BEST; + break; + case VM_EXITCODE_MTRAP: + vm_suspend_cpu(vm, vcpuid); + error = -1; + break; + default: + /* handled in userland */ + error = -1; + break; + } + + if (error == 0) { + /* VM exit conditions handled in-kernel, continue running */ + goto restart; + } + +exit: + kpreempt_disable(); + ctxop_detach(curthread, vcpu->ctxop); + /* Make sure all of the needed vCPU context state is saved */ + vmm_savectx(&vcpu->vtc); + kpreempt_enable(); + + VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); + + vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. 
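+ *
+ * For example, a 3-byte instruction which exited at %rip 0x1000 would
+ * normally resume at 0x1003; with 'inst_length' forced to zero, the resume
+ * point is 0x1000 and the instruction is executed again.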
+ */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %lx to %lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). 
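+ * That case is modeled here by suspending the VM with
+ * VM_SUSPEND_TRIPLEFAULT and reporting that no event remains to be
+ * injected.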
+ */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)", + info1, info2); + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. */ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exc_vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exc_errcode_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exc_errcode << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && + vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx", + vcpu->exc_vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), " + "retinfo(%lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + uint64_t regval; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * NMIs (which bear an exception vector of 2) are to be injected via + * their own specialized path using vm_inject_nmi(). + */ + if (vector == 2) { + return (EINVAL); + } + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", vector, vcpu->exc_vector); + return (EBUSY); + } + + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. 
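+ * (CR0.PE is consulted below; when it is clear, 'errcode_valid' is simply
+ * discarded.)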
+ */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); + KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); + if (!(regval & CR0_PE)) + errcode_valid = 0; + } + + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. + */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); + + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); + + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); +} + +void +vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + int error; + + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, 1); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); +} + +void +vm_inject_ud(struct vm *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +void +vm_inject_gp(struct vm *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +void +vm_inject_ac(struct vm *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +void +vm_inject_ss(struct vm *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void +vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) +{ + int error; + + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int +vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid); + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} + +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_extint_pending: invalid vcpuid 
%d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + panic("vm_extint_clear: inconsistent extint_pending state"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_inject_init(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_INIT; + /* + * As part of queuing the INIT request, clear any pending SIPI. It + * would not otherwise survive across the reset of the vCPU when it + * undergoes the requested INIT. We would not want it to linger when it + * could be mistaken as a subsequent (after the INIT) SIPI request. + */ + vcpu->run_state &= ~VRS_PEND_SIPI; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + + vcpu_unlock(vcpu); + return (0); +} + +int +vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_SIPI; + vcpu->sipi_vector = vector; + /* SIPI is only actionable if the CPU is waiting in INIT state */ + if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + } + vcpu_unlock(vcpu); + return (0); +} + +bool +vcpu_run_state_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + vcpu = &vm->vcpu[vcpuid]; + + /* Of interest: vCPU not in running state or with pending INIT */ + return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); +} + +int +vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) +{ + struct seg_desc desc; + const enum vm_reg_name clear_regs[] = { + VM_REG_GUEST_CR2, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_EFER, + }; + const enum vm_reg_name data_segs[] = { + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + }; + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + for (uint_t i = 0; i < nitems(clear_regs); i++) { + VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); + } + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); + + /* + * The prescribed contents of %rdx differ slightly between the Intel and + * AMD architectural definitions. The former expects the Extended Model + * in bits 16-19 where the latter expects all the Family, Model, and + * Stepping be there. Common boot ROMs appear to disregard this + * anyways, so we stick with a compromise value similar to what is + * spelled out in the Intel SDM. 
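+ * (The chosen value of 0x600 decodes as family 6, model 0, stepping 0.)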
+ */ + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0xffff0000; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); + + /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0; + desc.limit = 0xffff; + for (uint_t i = 0; i < nitems(data_segs); i++) { + VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); + VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); + } + + /* GDTR, IDTR */ + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); + + /* LDTR: Present, LDT */ + desc.access = 0x0082; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); + + /* TR: Present, 32-bit TSS */ + desc.access = 0x008b; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); + + vlapic_reset(vm_lapic(vm, vcpuid)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); + + vcpu->exitintinfo = 0; + vcpu->exception_pending = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + + /* + * A CPU reset caused by power-on or system reset clears more state than + * one which is trigged from an INIT IPI. + */ + if (!init_only) { + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + hma_fpu_init(vcpu->guestfpu); + + /* XXX: clear MSRs and other pieces */ + } + + return (0); +} + +static int +vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct seg_desc desc; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = (uint64_t)vector << 12; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, + (uint64_t)vector << 8)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); + + return (0); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + + return (vm->vioapic); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + + return (vm->vhpet); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vcpu_set_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vm, vcpuid, newstate, 
from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vcpu_get_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +uint64_t +vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) +{ + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + + uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; + + if (phys_adj) { + /* Include any offset for the current physical CPU too */ + extern hrtime_t tsc_gethrtime_tick_delta(void); + vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); + } + + return (vcpu_off); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + if (vm->suspend != 0) { + return (EBUSY); + } + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + + /* + * It is possible that this vCPU was undergoing activation at the same + * time that the VM was being suspended. If that happens to be the + * case, it should reflect the suspended state immediately. + */ + if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + } + + return (0); +} + +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + +static bool +vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, + uint64_t entry_rip) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + struct vm_exit *vme = &vcpu->exitinfo; + bool bail = false; + + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + + if (vm->suspend) { + if (on_entry) { + VERIFY(vm->suspend > VM_SUSPEND_NONE && + vm->suspend < VM_SUSPEND_LAST); + + vme->exitcode = VM_EXITCODE_SUSPENDED; + vme->u.suspended.how = vm->suspend; + } else { + /* + * Handling VM suspend is complicated, so if that + * condition is detected outside of VM-entry itself, + * just emit a BOGUS exitcode so we take a lap to pick + * up the event during an entry and are directed into + * the vm_handle_suspend() logic. + */ + vme->exitcode = VM_EXITCODE_BOGUS; + } + bail = true; + } + if (vcpu->reqidle) { + vme->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); + + if (!on_entry) { + /* + * A reqidle request detected outside of VM-entry can be + * handled directly by clearing the request (and taking + * a lap to userspace). 
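+ * The caller is expected to hold the vcpu lock in that case, as
+ * asserted below.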
+ */ + vcpu_assert_locked(vcpu); + vcpu->reqidle = 0; + } + bail = true; + } + if (vcpu_should_yield(vm, vcpuid)) { + vme->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); + bail = true; + } + if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { + vme->exitcode = VM_EXITCODE_DEBUG; + bail = true; + } + + if (bail) { + if (on_entry) { + /* + * If bailing out during VM-entry, the current %rip must + * be recorded in the exitinfo. + */ + vme->rip = entry_rip; + } + vme->inst_length = 0; + } + return (bail); +} + +static bool +vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) +{ + /* + * Bail-out check done prior to sleeping (in vCPU contexts like HLT or + * wait-for-SIPI) expect that %rip is already populated in the vm_exit + * structure, and we would only modify the exitcode. + */ + return (vcpu_bailout_checks(vm, vcpuid, false, 0)); +} + +bool +vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) +{ + /* + * Bail-out checks done as part of VM entry require an updated %rip to + * populate the vm_exit struct if any of the conditions of interest are + * matched in the check. + */ + return (vcpu_bailout_checks(vm, vcpuid, true, rip)); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) +{ + int hostcpu; + + ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); + + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (ntype == VCPU_NOTIFY_APIC) { + vlapic_post_intr(vcpu->vlapic, hostcpu); + } else { + poke_cpu(hostcpu); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. 
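+ * No IPI or wakeup is required in that situation.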
+ */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) { + cv_signal(&vcpu->vcpu_cv); + } + } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); +} + +void +vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + if (ntype == VCPU_NOTIFY_NONE) { + return; + } + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, ntype); + vcpu_unlock(vcpu); +} + +void +vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + hrtime_t now = gethrtime(); + + ASSERT3U(ustate, !=, vcpu->ustate); + ASSERT3S(ustate, <, VU_MAX); + ASSERT3S(ustate, >=, VU_INIT); + + hrtime_t delta = now - vcpu->ustate_when; + vcpu->ustate_total[vcpu->ustate] += delta; + + membar_producer(); + + vcpu->ustate_when = now; + vcpu->ustate = ustate; +} + +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + +struct vm_client * +vm_get_vmclient(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].vmclient); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + uint_t num_copyinfo) +{ + for (uint_t idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) { + vmp_release((vm_page_t *)copyinfo[idx].cookie); + } + } + bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + uint_t num_copyinfo, int *fault) +{ + uint_t idx, nused; + size_t n, off, remaining; + vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); + + bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + uint64_t gpa; + int error; + + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + off = gpa & PAGEOFFSET; + n = min(remaining, PAGESIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + vm_page_t *vmp; + caddr_t hva; + + vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); + if (vmp == NULL) { + break; + } + if ((prot & PROT_WRITE) != 0) { + hva = (caddr_t)vmp_get_writable(vmp); + } else { + hva = (caddr_t)vmp_get_readable(vmp); + } + copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); + copyinfo[idx].cookie = 
vmp; + copyinfo[idx].prot = prot; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (EFAULT); + } else { + *fault = 0; + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + ASSERT(copyinfo[idx].prot & PROT_READ); + + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + ASSERT(copyinfo[idx].prot & PROT_WRITE); + + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} + +/* + * Return the amount of in-use and wired memory for the VM. Since + * these are global stats, only return the values with for vCPU 0 + */ +VMM_STAT_DECLARE(VMM_MEM_RESIDENT); + +static void +vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, + PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + } +} + +VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); + +int +vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val) +{ + return (vm_inout_access(&vm->ioports, in, port, bytes, val)); +} + +/* + * bhyve-internal interfaces to attach or detach IO port handlers. + * Must be called with VM write lock held for safety. + */ +int +vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, + void **cookie) +{ + int err; + err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); + if (err == 0) { + *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); + } + return (err); +} +int +vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, + void **old_arg) +{ + uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); + int err; + + err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); + if (err == 0) { + *cookie = NULL; + } + return (err); +} + +/* + * External driver interfaces to attach or detach IO port handlers. + * Must be called with VM write lock held for safety. 
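+ * Unlike the bhyve-internal variants above, these register with the
+ * IOPF_DRV_HOOK flag, and consumers are expected to tear down their
+ * hooks cleanly, as asserted in vm_ioport_unhook().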
+ */ +int +vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, + void *arg, void **cookie) +{ + int err; + + if (port == 0) { + return (EINVAL); + } + + err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); + if (err == 0) { + *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); + } + return (err); +} +void +vm_ioport_unhook(struct vm *vm, void **cookie) +{ + uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); + ioport_handler_t old_func; + void *old_arg; + int err; + + err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); + + /* ioport-hook-using drivers are expected to be well-behaved */ + VERIFY0(err); + VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); + + *cookie = NULL; +} + +int +vmm_kstat_update_vcpu(struct kstat *ksp, int rw) +{ + struct vm *vm = ksp->ks_private; + vmm_vcpu_kstats_t *vvk = ksp->ks_data; + const int vcpuid = vvk->vvk_vcpu.value.ui32; + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + ASSERT3U(vcpuid, <, VM_MAXCPU); + + vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; + vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; + vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; + vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; + vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; + vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; + + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/vmm.conf b/usr/src/uts/intel/io/vmm/vmm.conf new file mode 100644 index 0000000000..8833076014 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm.conf @@ -0,0 +1 @@ +name="vmm" parent="pseudo"; diff --git a/usr/src/uts/intel/io/vmm/vmm.mapfile b/usr/src/uts/intel/io/vmm/vmm.mapfile new file mode 100644 index 0000000000..fb1c9366de --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm.mapfile @@ -0,0 +1,70 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE + +# +# Copyright 2019 Joyent, Inc. +# Copyright 2021 Oxide Computer Company +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # bhyve driver API + vmm_drv_hold; + vmm_drv_rele; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; + vmm_drv_page_hold; + vmm_drv_page_release; + vmm_drv_page_release_chain; + vmm_drv_page_readable; + vmm_drv_page_writable; + vmm_drv_page_chain; + vmm_drv_page_next; + vmm_drv_ioport_hook; + vmm_drv_ioport_unhook; + vmm_drv_msi; + + # IOMMU API for PCI pass-thru + iommu_add_device; + iommu_host_domain; + iommu_remove_device; + lapic_intr_msi; + vm_iommu_domain; + vm_map_mmio; + vm_unmap_mmio; + + local: + *; +}; diff --git a/usr/src/uts/intel/io/vmm/vmm_gpt.c b/usr/src/uts/intel/io/vmm/vmm_gpt.c new file mode 100644 index 0000000000..586b994d56 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_gpt.c @@ -0,0 +1,586 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/atomic.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/panic.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/hat_i86.h> + +#include <sys/vmm_gpt.h> + +/* + * VMM Generic Page Tables + * + * Bhyve runs on AMD and Intel hosts and both support nested page tables + * describing the guest's physical address space. But the two use different and + * mutually incompatible page table formats: Intel uses the EPT, which is based + * on the Itanium page table format, while AMD uses the nPT, which is based on + * the x86_64 page table format. + * + * The GPT abstracts these format differences, and provides a single interface + * for interacting with either kind of table structure. + * + * At a high-level, the GPT is a tree that mirrors the paging table radix tree. + * It is parameterized with operations on PTEs that are specific to the table + * type (EPT or nPT) and also keeps track of how many pages the table maps, as + * well as a pointer to the root node in the tree. + * + * A node in the GPT keep pointers to its parent (NULL for the root), its + * left-most child, and its rightward siblings. The node understands its + * position in the tree in terms of its level it appears at and the index it + * occupies at its parent's level, as well as how many children it has. It also + * owns the physical memory page for the hardware page table entries that map + * its children. Thus, for a node at any given level in the tree, the nested + * PTE for that node's child at index $i$ is the i'th uint64_t in that node's + * entry page and the entry page is part of the paging structure consumed by + * hardware. + * + * The GPT interface provides functions for populating and vacating the tree for + * regions in the guest physical address space, and for mapping and unmapping + * pages in populated regions. Users must populate a region before mapping + * pages into it, and must unmap pages before vacating the region. 
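+ *
+ * As an illustrative sketch of that lifecycle, using only the
+ * functions defined in this file, a caller would do roughly the
+ * following (variable names are placeholders):
+ *
+ *     gpt = vmm_gpt_alloc(pte_ops);
+ *     vmm_gpt_populate_region(gpt, start, end);
+ *     (void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
+ *     ...
+ *     (void) vmm_gpt_unmap(gpt, gpa);
+ *     vmm_gpt_vacate_region(gpt, start, end);
+ *     vmm_gpt_free(gpt);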
+ * + * The interface also exposes a function for walking the table from the root to + * a leaf entry, populating an array of pointers to PTEs. This walk uses the + * hardware page structure itself, and is thus fast, though as a result it + * potentially aliases entries; caveat emptor. The walk primitive is used for + * mapping, unmapping, and lookups. + * + * Format-specific differences are abstracted by parameterizing the GPT with a + * set of PTE operations specific to the platform. The GPT code makes use of + * these when mapping or populating entries, resetting accessed and dirty bits + * on entries, and similar operations. + */ + +/* + * A GPT node. + * + * Each node contains pointers to its parent, its left-most child, and its + * rightward siblings. Interior nodes also maintain a reference count, and + * each node contains its level and index in its parent's table. Finally, + * each node contains the host PFN of the page that it links into the page + * table, as well as a kernel pointer to table. + * + * On leaf nodes, the reference count tracks how many entries in the table are + * covered by mapping from the containing vmspace. This is maintained during + * calls to vmm_populate_region() and vmm_gpt_vacate_region() as part of vmspace + * map/unmap operations, rather than in the data path of faults populating the + * PTEs themselves. + * + * Note, this is carefully sized to fit exactly into a 64-byte cache line. + */ +typedef struct vmm_gpt_node vmm_gpt_node_t; +struct vmm_gpt_node { + uint64_t vgn_host_pfn; + uint16_t vgn_level; + uint16_t vgn_index; + uint32_t vgn_ref_cnt; + vmm_gpt_node_t *vgn_parent; + vmm_gpt_node_t *vgn_children; + vmm_gpt_node_t *vgn_siblings; + uint64_t *vgn_entries; + uint64_t vgn_gpa; + uint64_t _vgn_pad; +}; + +/* + * A VMM Generic Page Table. + * + * The generic page table is a format-agnostic, 4-level paging structure + * modeling a second-level page table (EPT on Intel, nPT on AMD). It + * contains a counter of pages the table maps, a pointer to the root node + * in the table, and is parameterized with a set of PTE operations specific + * to the table type. + */ +struct vmm_gpt { + vmm_gpt_node_t *vgpt_root; + vmm_pte_ops_t *vgpt_pte_ops; +}; + +/* + * VMM Guest Page Tables + */ + +/* + * Allocates a vmm_gpt_node_t structure with corresponding page of memory to + * hold the PTEs it contains. + */ +static vmm_gpt_node_t * +vmm_gpt_node_alloc(void) +{ + vmm_gpt_node_t *node; + caddr_t page; + + node = kmem_zalloc(sizeof (*node), KM_SLEEP); + /* + * Note: despite the man page, allocating PAGESIZE bytes is + * guaranteed to be page-aligned. + */ + page = kmem_zalloc(PAGESIZE, KM_SLEEP); + node->vgn_entries = (uint64_t *)page; + node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page); + + return (node); +} + +/* + * Allocates and initializes a vmm_gpt_t. + */ +vmm_gpt_t * +vmm_gpt_alloc(vmm_pte_ops_t *pte_ops) +{ + vmm_gpt_t *gpt; + + VERIFY(pte_ops != NULL); + gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP); + gpt->vgpt_pte_ops = pte_ops; + gpt->vgpt_root = vmm_gpt_node_alloc(); + + return (gpt); +} + +/* + * Frees the given node, first nulling out all of its links to other nodes in + * the tree, adjusting its parents reference count, and unlinking itself from + * its parents page table. 
+ */ +static void +vmm_gpt_node_free(vmm_gpt_node_t *node) +{ + ASSERT(node != NULL); + ASSERT3U(node->vgn_ref_cnt, ==, 0); + ASSERT(node->vgn_host_pfn != PFN_INVALID); + ASSERT(node->vgn_entries != NULL); + if (node->vgn_parent != NULL) { + uint64_t *parent_entries = node->vgn_parent->vgn_entries; + parent_entries[node->vgn_index] = 0; + node->vgn_parent->vgn_ref_cnt--; + } + kmem_free(node->vgn_entries, PAGESIZE); + kmem_free(node, sizeof (*node)); +} + +/* + * Frees the portion of the radix tree rooted at the given node. + */ +static void +vmm_gpt_node_tree_free(vmm_gpt_node_t *node) +{ + ASSERT(node != NULL); + + for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL; + child != NULL; + child = next) { + next = child->vgn_siblings; + vmm_gpt_node_tree_free(child); + } + vmm_gpt_node_free(node); +} + +/* + * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries + * it refers to. + */ +void +vmm_gpt_free(vmm_gpt_t *gpt) +{ + vmm_gpt_node_tree_free(gpt->vgpt_root); + kmem_free(gpt, sizeof (*gpt)); +} + +/* + * Return the index in the paging structure for the given level. + */ +static inline uint16_t +vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level) +{ + const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 }; + const uint_t MASK = (1U << 9) - 1; + ASSERT(level < MAX_GPT_LEVEL); + return ((gpa >> SHIFTS[level]) & MASK); +} + +/* + * Finds the child for the given GPA in the given parent node. + * Returns a pointer to node, or NULL if it is not found. + */ +static vmm_gpt_node_t * +vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa) +{ + if (parent == NULL) + return (NULL); + + const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level); + for (vmm_gpt_node_t *child = parent->vgn_children; + child != NULL && child->vgn_index <= index; + child = child->vgn_siblings) { + if (child->vgn_index == index) + return (child); + } + + return (NULL); +} + +/* + * Walks the GPT for the given GPA, accumulating entries to the given depth. If + * the walk terminates before the depth is reached, the remaining entries are + * written with NULLs. + */ +void +vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries, + enum vmm_gpt_node_level depth) +{ + uint64_t *current_entries, entry; + pfn_t pfn; + + ASSERT(gpt != NULL); + current_entries = gpt->vgpt_root->vgn_entries; + for (uint_t i = 0; i < depth; i++) { + if (current_entries == NULL) { + entries[i] = NULL; + continue; + } + entries[i] = ¤t_entries[vmm_gpt_node_index(gpa, i)]; + entry = *entries[i]; + if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) { + current_entries = NULL; + continue; + } + pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry); + current_entries = (uint64_t *)hat_kpm_pfn2va(pfn); + } +} + +/* + * Looks up an entry given GPA. + */ +uint64_t * +vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa) +{ + uint64_t *entries[MAX_GPT_LEVEL]; + + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + + return (entries[LEVEL1]); +} + +/* + * Adds a node for the given GPA to the GPT as a child of the given parent. 
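+ * Children are linked into the parent in index-sorted order, which
+ * allows vmm_gpt_node_find_child() to terminate its search early.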
+ */ +static void +vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child, + uint64_t gpa) +{ + vmm_gpt_node_t **prevp; + vmm_gpt_node_t *node; + uint64_t *parent_entries, entry; + + ASSERT(gpt != NULL); + ASSERT(gpt->vgpt_pte_ops != NULL); + ASSERT(parent != NULL); + ASSERT(child != NULL); + ASSERT3U(parent->vgn_level, <, LEVEL1); + + const uint64_t gpa_mask[3] = { + [LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */ + [LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */ + [LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */ + }; + const int index = vmm_gpt_node_index(gpa, parent->vgn_level); + child->vgn_index = index; + child->vgn_level = parent->vgn_level + 1; + child->vgn_parent = parent; + child->vgn_gpa = gpa & gpa_mask[parent->vgn_level]; + parent_entries = parent->vgn_entries; + entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn); + parent_entries[index] = entry; + + for (prevp = &parent->vgn_children, node = parent->vgn_children; + node != NULL; + prevp = &node->vgn_siblings, node = node->vgn_siblings) { + if (node->vgn_index > child->vgn_index) { + break; + } + } + if (node != NULL) + ASSERT3U(node->vgn_index, !=, child->vgn_index); + child->vgn_siblings = node; + *prevp = child; + parent->vgn_ref_cnt++; +} + +/* + * Populate the GPT with nodes so that a entries for the given GPA exist. Note + * that this does not actually map the entry, but simply ensures that the + * entries exist. + */ +static void +vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa) +{ + vmm_gpt_node_t *node, *child; + + ASSERT(gpt != NULL); + ASSERT0(gpa & PAGEOFFSET); + + node = gpt->vgpt_root; + for (uint_t i = 0; i < LEVEL1; i++) { + ASSERT(node != NULL); + child = vmm_gpt_node_find_child(node, gpa); + if (child == NULL) { + child = vmm_gpt_node_alloc(); + ASSERT(child != NULL); + vmm_gpt_add_child(gpt, node, child, gpa); + } + node = child; + } + + /* + * Bump the reference count for this leaf for the PTE that is now usable + * by the mapping covering its GPA. + */ + ASSERT3U(node->vgn_level, ==, LEVEL1); + ASSERT3U(node->vgn_ref_cnt, <, 512); + node->vgn_ref_cnt++; +} + +/* + * Ensures that PTEs for the region of address space bounded by + * [start, end) exist in the tree. + */ +void +vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) +{ + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + + for (uint64_t page = start; page < end; page += PAGESIZE) { + vmm_gpt_populate_entry(gpt, page); + } +} + +/* + * Format a PTE and install it in the provided PTE-pointer. + */ +bool +vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot, + uint8_t attr) +{ + uint64_t entry, old_entry; + + entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr); + old_entry = atomic_cas_64(ptep, 0, entry); + if (old_entry != 0) { + ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==, + gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry)); + return (false); + } + + return (true); +} + +/* + * Inserts an entry for a given GPA into the table. The caller must + * ensure that a conflicting PFN is not mapped at the requested location. + * Racing operations to map the same PFN at one location is acceptable and + * properly handled. 
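+ * (Races are resolved by the atomic compare-and-swap performed in
+ * vmm_gpt_map_at().)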
+ */ +bool +vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr) +{ + uint64_t *entries[MAX_GPT_LEVEL]; + + ASSERT(gpt != NULL); + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + ASSERT(entries[LEVEL1] != NULL); + + return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr)); +} + +/* + * Removes a child node from its parent's list of children, and then frees + * the now-orphaned child. + */ +static void +vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child) +{ + ASSERT(parent != NULL); + + ASSERT3P(child->vgn_children, ==, NULL); + vmm_gpt_node_t **prevp = &parent->vgn_children; + for (vmm_gpt_node_t *node = parent->vgn_children; + node != NULL; + prevp = &node->vgn_siblings, node = node->vgn_siblings) { + if (node == child) { + *prevp = node->vgn_siblings; + vmm_gpt_node_free(node); + break; + } + } +} + +/* + * Cleans up unused inner nodes in the GPT. Asserts that the leaf corresponding + * to the entry does not map any additional pages. + */ +static void +vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa) +{ + vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node; + + node = gpt->vgpt_root; + for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) { + nodes[i] = node; + node = vmm_gpt_node_find_child(node, gpa); + } + for (uint_t i = LEVEL1; i > 0; i--) { + node = nodes[i]; + + if (node == NULL) + continue; + + if (i == LEVEL1) { + ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]); + ASSERT3U(node->vgn_ref_cnt, !=, 0); + + /* + * Just as vmm_gpt_populate_entry() increments the + * reference count for leaf PTEs which become usable, + * here we decrement it as they become unusable as the + * mapping covering its GPA is removed. + */ + node->vgn_ref_cnt--; + } + + if (node->vgn_ref_cnt != 0) + break; + vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]); + } +} + +/* + * Cleans up the unused inner nodes in the GPT for a region of guest physical + * address space of [start, end). The region must map no pages. + */ +void +vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) +{ + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + + for (uint64_t page = start; page < end; page += PAGESIZE) { + vmm_gpt_vacate_entry(gpt, page); + } +} + +/* + * Remove a mapping from the table. Returns false if the page was not mapped, + * otherwise returns true. + */ +bool +vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa) +{ + uint64_t *entries[MAX_GPT_LEVEL], entry; + + ASSERT(gpt != NULL); + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + if (entries[LEVEL1] == NULL) + return (false); + + entry = *entries[LEVEL1]; + *entries[LEVEL1] = 0; + return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)); +} + +/* + * Un-maps the region of guest physical address space bounded by [start..end). + * Returns the number of pages that are unmapped. + */ +size_t +vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end) +{ + ASSERT0(start & PAGEOFFSET); + ASSERT0(end & PAGEOFFSET); + + size_t num_unmapped = 0; + for (uint64_t page = start; page < end; page += PAGESIZE) { + if (vmm_gpt_unmap(gpt, page) != 0) { + num_unmapped++; + } + } + + return (num_unmapped); +} + +/* + * Returns a value indicating whether or not this GPT maps the given + * GPA. If the GPA is mapped, *protp will be filled with the protection + * bits of the entry. Otherwise, it will be ignored. 
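+ * The same applies to *pfnp, which receives the PFN of the mapping.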
+ */ +bool +vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp) +{ + uint64_t entry; + + if (ptep == NULL) { + return (false); + } + entry = *ptep; + if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) { + return (false); + } + *pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry); + *protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry); + return (true); +} + +/* + * Resets the accessed bit on the page table entry pointed to be `entry`. + * If `on` is true, the bit will be set, otherwise it will be cleared. + * The old value of the bit is returned. + */ +uint_t +vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on) +{ + ASSERT(entry != NULL); + return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on)); +} + +/* + * Resets the dirty bit on the page table entry pointed to be `entry`. + * If `on` is true, the bit will be set, otherwise it will be cleared. + * The old value of the bit is returned. + */ +uint_t +vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on) +{ + ASSERT(entry != NULL); + return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on)); +} + +/* + * Get properly formatted PML4 (EPTP/nCR3) for GPT. + */ +uint64_t +vmm_gpt_get_pmtp(vmm_gpt_t *gpt) +{ + return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn)); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_host.c b/usr/src/uts/intel/io/vmm/vmm_host.c new file mode 100644 index 0000000000..2c1897b18f --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_host.c @@ -0,0 +1,181 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> + +#include <machine/cpufunc.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; + +void +vmm_host_state_init(void) +{ + unsigned int regs[4]; + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. + */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + +uint64_t +vmm_get_host_datasel(void) +{ + return (SEL_GDT(GDT_KDATA, SEL_KPL)); +} + +uint64_t +vmm_get_host_codesel(void) +{ + return (SEL_GDT(GDT_KCODE, SEL_KPL)); +} + +uint64_t +vmm_get_host_tsssel(void) +{ + return (SEL_GDT(GDT_KTSS, SEL_KPL)); +} + +uint64_t +vmm_get_host_fsbase(void) +{ + return (rdmsr(MSR_FSBASE)); +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + desctbr_t idtr; + + rd_idtr(&idtr); + return (idtr.dtr_base); +} + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_host.h b/usr/src/uts/intel/io/vmm/vmm_host.h new file mode 100644 index 0000000000..c5688f108a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_host.h @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef __FreeBSD__ +#include <sys/cpuvar.h> +#endif + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + return ((uint64_t)CPU->cpu_tss); +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)*PCPU_PTR(gdt)); +#else + desctbr_t gdtr; + + rd_gdtr(&gdtr); + return (gdtr.dtr_base); +#endif +} + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)get_pcpu()); +#else + return (rdmsr(MSR_GSBASE)); +#endif +} + +#ifndef __FreeBSD__ +static __inline uint64_t +vmm_get_host_fssel(void) +{ + return (KFS_SEL); +} + +static __inline uint64_t +vmm_get_host_gssel(void) +{ + return (KGS_SEL); +} +#endif +#endif diff --git a/usr/src/uts/intel/io/vmm/vmm_instruction_emul.c b/usr/src/uts/intel/io/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000000..06baec53bf --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_instruction_emul.c @@ -0,0 +1,3839 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <machine/vmparam.h> +#include <machine/vmm.h> +#include <sys/vmm_kernel.h> +#include <sys/vmm_vm.h> + +#include <sys/vmm_instruction_emul.h> +#include <x86/psl.h> +#include <x86/specialreg.h> + +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +enum vie_status { + VIES_INIT = (1U << 0), + VIES_MMIO = (1U << 1), + VIES_INOUT = (1U << 2), + VIES_OTHER = (1U << 3), + VIES_INST_FETCH = (1U << 4), + VIES_INST_DECODE = (1U << 5), + VIES_PENDING_MMIO = (1U << 6), + VIES_PENDING_INOUT = (1U << 7), + VIES_REPEAT = (1U << 8), + VIES_USER_FALLBACK = (1U << 9), + VIES_COMPLETE = (1U << 10), +}; + +/* State of request to perform emulated access (inout or MMIO) */ +enum vie_req { + VR_NONE, + VR_PENDING, + VR_DONE, +}; + +struct vie_mmio { + uint64_t data; + uint64_t gpa; + uint8_t bytes; + enum vie_req state; +}; + +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + vex_present:1, /* VEX prefixed */ + vex_l:1, /* L bit */ + index:4, /* SIB byte */ + base:4; /* SIB byte */ + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + + uint8_t vex_reg:4, /* vvvv: first source reg specifier */ + vex_pp:2, /* pp */ + _sparebits:2; + + uint8_t _sparebytes[2]; + + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + struct vie_op op; /* opcode description */ + + enum vie_status status; + + struct vm_guest_paging paging; /* guest paging state */ + + uint64_t mmio_gpa; /* faulting GPA */ + struct vie_mmio mmio_req_read; + struct vie_mmio mmio_req_write; + + struct vm_inout inout; /* active in/out op */ + enum vie_req inout_req_state; + uint32_t inout_req_val; /* value from userspace */ +}; + + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_MOVSX, + VIE_OP_TYPE_MOVZX, + VIE_OP_TYPE_MOV_CR, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_OR, + VIE_OP_TYPE_SUB, + VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, + VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_TWOB_GRP15, + VIE_OP_TYPE_ADD, + VIE_OP_TYPE_TEST, + VIE_OP_TYPE_BEXTR, + VIE_OP_TYPE_CLTS, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) +#define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */ + +static const struct vie_op three_byte_opcodes_0f38[256] = { + [0xF7] = { + .op_byte = 0xF7, + .op_type = VIE_OP_TYPE_BEXTR, + }, +}; + +static const struct vie_op two_byte_opcodes[256] = { + [0x06] = { + .op_byte = 0x06, + .op_type = VIE_OP_TYPE_CLTS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0x20] = { + .op_byte = 0x20, + .op_type = VIE_OP_TYPE_MOV_CR, + .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0x22] = { + .op_byte = 0x22, + .op_type = VIE_OP_TYPE_MOV_CR, + .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + 
.op_type = VIE_OP_TYPE_ADD, + }, + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x81] = { + /* Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xF6] = { + /* XXX Group 3 extended opcode - not just TEST */ + .op_byte = 0xF6, + .op_type = VIE_OP_TYPE_TEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xF7] = { + /* XXX Group 3 extended opcode - not just TEST */ + .op_byte = 0xF7, + .op_type = VIE_OP_TYPE_TEST, + .op_flags = VIE_OP_F_IMM, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + + +/* + * Paging defines, previously pulled in from machine/pmap.h + */ +#define PG_V (1 << 0) /* Present */ +#define PG_RW (1 << 1) /* Read/Write */ +#define PG_U (1 << 2) /* User/Supervisor */ +#define PG_A (1 << 5) /* Accessed */ +#define PG_M (1 << 6) /* Dirty */ +#define PG_PS (1 << 7) /* Largepage */ + +/* + * Paging except defines, previously pulled in from machine/pmap.h + */ +#define PGEX_P (1 << 0) /* Non-present/Protection */ +#define PGEX_W (1 << 1) /* Read/Write */ +#define PGEX_U (1 << 2) /* User/Supervisor */ +#define PGEX_RSV (1 << 3) /* (Non-)Reserved */ +#define PGEX_I (1 << 4) /* Instruction */ + + +static enum vm_reg_name 
gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static enum vm_reg_name cr_map[16] = { + VM_REG_GUEST_CR0, + VM_REG_LAST, + VM_REG_GUEST_CR2, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST, + VM_REG_LAST +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + + +static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, + uint64_t gpa, uint64_t *rval, int bytes); +static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, + uint64_t gpa, uint64_t wval, int bytes); +static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla); +static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); +static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, + uint64_t gla); +static uint64_t vie_size2mask(int size); + +struct vie * +vie_alloc() +{ + return (kmem_zalloc(sizeof (struct vie), KM_SLEEP)); +} + +void +vie_free(struct vie *vie) +{ + kmem_free(vie, sizeof (struct vie)); +} + +enum vm_reg_name +vie_regnum_map(uint8_t regnum) +{ + VERIFY3U(regnum, <, 16); + return (gpr_map[regnum]); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. 
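+		 * For instance, writing 0xab to %ah while %rax holds
+		 * 0x1122334455667788: val becomes 0xab00, mask 0xff00,
+		 * and the merged result 0x112233445566ab88 leaves every
+		 * byte outside bits 8-15 untouched.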
+ */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +static int +vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +static int +vie_repeat(struct vie *vie) +{ + vie->status |= VIES_REPEAT; + + /* + * Clear out any cached operation values so the repeated instruction can + * begin without using that stale state. Other state, such as the + * decoding results, are kept around as it will not vary between + * iterations of a rep-prefixed instruction. + */ + if ((vie->status & VIES_MMIO) != 0) { + vie->mmio_req_read.state = VR_NONE; + vie->mmio_req_write.state = VR_NONE; + } else if ((vie->status & VIES_INOUT) != 0) { + vie->inout_req_state = VR_NONE; + } else { + panic("unexpected emulation state"); + } + + return (EAGAIN); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). + */ +/* BEGIN CSTYLED */ +#define GETCC(sz) \ +static ulong_t \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + ulong_t rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack +/* END CSTYLED */ + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static ulong_t +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +/* BEGIN CSTYLED */ +#define GETADDFLAGS(sz) \ +static ulong_t \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + ulong_t rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack +/* END CSTYLED */ + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static ulong_t +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + +/* + * Return the status flags that would result from doing (x & y). 
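+ *
+ * As with GETCC() and GETADDFLAGS() above, the macro below generates
+ * getandflags{8,16,32,64}(): each runs the AND on the host via inline
+ * assembly and captures the resulting %rflags with pushfq/popq rather
+ * than recomputing the flags by hand.  For example,
+ * getandflags(4, 0xf0, 0x0f) performs a 32-bit AND whose result is
+ * zero, so PSL_Z is set in the returned flags.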
+ */ +/* BEGIN CSTYLED */ +#define GETANDFLAGS(sz) \ +static ulong_t \ +getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + ulong_t rflags; \ + \ + __asm __volatile("and %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack +/* END CSTYLED */ + +GETANDFLAGS(8); +GETANDFLAGS(16); +GETANDFLAGS(32); +GETANDFLAGS(64); + +static ulong_t +getandflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getandflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getandflags8(x, y)); + else if (opsize == 2) + return (getandflags16(x, y)); + else if (opsize == 4) + return (getandflags32(x, y)); + else + return (getandflags64(x, y)); +} + +static int +vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid) +{ + uint64_t val; + int err; + enum vm_reg_name gpr = gpr_map[vie->rm]; + enum vm_reg_name cr = cr_map[vie->reg]; + + uint_t size = 4; + if (vie->paging.cpu_mode == CPU_MODE_64BIT) { + size = 8; + } + + switch (vie->op.op_byte) { + case 0x20: + /* + * MOV control register (ModRM:reg) to reg (ModRM:r/m) + * 20/r: mov r32, CR0-CR7 + * 20/r: mov r64, CR0-CR7 + * REX.R + 20/0: mov r64, CR8 + */ + if (vie->paging.cpl != 0) { + vm_inject_gp(vm, vcpuid); + vie->num_processed = 0; + return (0); + } + err = vm_get_register(vm, vcpuid, cr, &val); + if (err != 0) { + /* #UD for access to non-existent CRs */ + vm_inject_ud(vm, vcpuid); + vie->num_processed = 0; + return (0); + } + err = vie_update_register(vm, vcpuid, gpr, val, size); + break; + case 0x22: { + /* + * MOV reg (ModRM:r/m) to control register (ModRM:reg) + * 22/r: mov CR0-CR7, r32 + * 22/r: mov CR0-CR7, r64 + * REX.R + 22/0: mov CR8, r64 + */ + uint64_t old, diff; + + if (vie->paging.cpl != 0) { + vm_inject_gp(vm, vcpuid); + vie->num_processed = 0; + return (0); + } + err = vm_get_register(vm, vcpuid, cr, &old); + if (err != 0) { + /* #UD for access to non-existent CRs */ + vm_inject_ud(vm, vcpuid); + vie->num_processed = 0; + return (0); + } + err = vm_get_register(vm, vcpuid, gpr, &val); + VERIFY0(err); + val &= size2mask[size]; + diff = old ^ val; + + switch (cr) { + case VM_REG_GUEST_CR0: + if ((diff & CR0_PG) != 0) { + uint64_t efer; + + err = vm_get_register(vm, vcpuid, + VM_REG_GUEST_EFER, &efer); + VERIFY0(err); + + /* Keep the long-mode state in EFER in sync */ + if ((val & CR0_PG) != 0 && + (efer & EFER_LME) != 0) { + efer |= EFER_LMA; + } + if ((val & CR0_PG) == 0 && + (efer & EFER_LME) != 0) { + efer &= ~EFER_LMA; + } + + err = vm_set_register(vm, vcpuid, + VM_REG_GUEST_EFER, efer); + VERIFY0(err); + } + /* TODO: enforce more of the #GP checks */ + err = vm_set_register(vm, vcpuid, cr, val); + VERIFY0(err); + break; + case VM_REG_GUEST_CR2: + case VM_REG_GUEST_CR3: + case VM_REG_GUEST_CR4: + /* TODO: enforce more of the #GP checks */ + err = vm_set_register(vm, vcpuid, cr, val); + break; + default: + /* The cr_map mapping should prevent this */ + panic("invalid cr %d", cr); + } + break; + } + default: + return (EINVAL); + } + return (err); +} + +static int +vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = 
vie_read_bytereg(vie, vm, vcpuid, &byte); + if (error == 0) { + error = vie_mmio_write(vie, vm, vcpuid, gpa, byte, + size); + } + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); + if (error == 0) + error = vie_write_bytereg(vie, vm, vcpuid, val); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + val = vie->immediate; + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); + break; + default: + break; + } + + return (error); +} + +static int +vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). 
+ * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. + */ +static int +vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize, + int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, + uint64_t *gla) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + struct vm_guest_paging *paging; + + paging = &vie->paging; + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vm_get_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (-1); + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (-1); + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (-1); + } + + return (0); +} + +static int +vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + struct vm_copyinfo copyinfo[2]; + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, fault, opsize, seg, repeat; + struct vm_guest_paging *paging; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + error = 0; + paging = &vie->paging; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. 
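+		 * With a 32-bit address size, for example, only the low
+		 * 32 bits of %rcx are consulted: a count of 0x100000000
+		 * masks to zero and the MOVS completes immediately
+		 * without touching memory.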
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg, + VM_REG_GUEST_RSI, &srcaddr) != 0) { + goto done; + } + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (2): read from system memory and write to mmio. + */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); + if (error) + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, + &dstaddr) != 0) { + goto done; + } + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, + opsize); + + if (error == 0) { + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + } + /* + * Regardless of whether the MMIO read was successful or + * not, the copy resources must be cleaned up. + */ + vm_copy_teardown(vm, vcpuid, copyinfo, + nitems(copyinfo)); + if (error != 0) { + goto done; + } + } else { + /* + * Case (4): read from and write to mmio. + * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. 
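+			 * Both translations are therefore performed up
+			 * front with vm_gla2gpa(); if either faults, the
+			 * guest is resumed to handle the page fault
+			 * before any device register is touched.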
+ */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa, &fault); + if (error || fault) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val, + opsize); + if (error) + goto done; + + error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val, + opsize); + if (error) + goto done; + } + } + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + return (vie_repeat(vie)); + } +done: + return (error); +} + +static int +vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); + if (error) + return (error); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
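+		 * vie_repeat() returns EAGAIN so the same decoded
+		 * instruction is re-entered without another fetch or
+		 * decode; a 'rep stosb' with %ecx == 3, for example,
+		 * passes through here three times, decrementing %rcx on
+		 * each pass, before execution moves past the instruction.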
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + return (vie_repeat(vie)); + } + + return (0); +} + +static int +vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); + break; + default: + break; + } + if (error) + return (error); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. 
+ * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); + break; + default: + break; + } + if (error) + return (error); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + uint64_t regop, memop, op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x39: + case 0x3B: + /* + * 39/r CMP r/m16, r16 + * 39/r CMP r/m32, r32 + * REX.W 39/r CMP r/m64, r64 + * + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare the first operand with the second operand and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the register operand */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, ®op); + if (error) + return (error); + + /* Get the memory operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size); + if (error) + return (error); + + if (vie->op.op_byte == 0x3B) { + op1 = regop; + op2 = memop; + } else { + op1 = memop; + op2 = regop; + } + rflags2 = getcc(size, op1, op2); + break; + case 0x80: + case 0x81: + case 0x83: + /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. 
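+		 * For example, a 'cmp $0, (mem)' aimed at a 32-bit MMIO
+		 * word reaches the getcc() call below with a zero
+		 * immediate, so PSL_Z ends up set exactly when the value
+		 * read back from the device was zero.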
+ * + */ + if (vie->op.op_byte == 0x80) + size = 1; + + /* get the first operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + uint64_t op1, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xF6: + /* + * F6 /0 test r/m8, imm8 + * + * Test mem (ModRM:r/m) with immediate and set status + * flags according to the results. The comparison is + * performed by anding the immediate from the first + * operand and then setting the status flags. + */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + size = 1; /* override for byte operation */ + + error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size); + if (error) + return (error); + + rflags2 = getandflags(size, op1, vie->immediate); + break; + case 0xF7: + /* + * F7 /0 test r/m16, imm16 + * F7 /0 test r/m32, imm32 + * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 + * + * Test mem (ModRM:r/m) with immediate and set status + * flags according to the results. The comparison is + * performed by anding the immediate from the first + * operand and then setting the status flags. + */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size); + if (error) + return (error); + + rflags2 = getandflags(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + */ + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + uint64_t src1, src2, dst, rflags; + unsigned start, len; + int error, size; + struct vm_guest_paging *paging; + + size = vie->opsize; + error = EINVAL; + paging = &vie->paging; + + /* + * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b + * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b + * + * Destination operand is ModRM:reg. Source operands are ModRM:r/m and + * Vex.vvvv. + * + * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). + */ + if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) + size = 4; + + /* + * Extracts contiguous bits from the first /source/ operand (second + * operand) using an index and length specified in the second /source/ + * operand (third operand). + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size); + if (error) + return (error); + error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); + if (error) + return (error); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + start = (src2 & 0xff); + len = (src2 & 0xff00) >> 8; + + /* If no bits are extracted, the destination register is cleared. */ + dst = 0; + + /* If START exceeds the operand size, no bits are extracted. 
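+	 * For instance, with a 32-bit operand and a control value of
+	 * 0x0804 in the second source register, start is 4 and len is 8,
+	 * so bits 4-11 of the memory operand are shifted down into the
+	 * low byte of the destination.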
*/ + if (start > size * 8) + goto done; + /* Length is bounded by both the destination size and start offset. */ + if (start + len > size * 8) + len = (size * 8) - start; + if (len == 0) + goto done; + + if (start > 0) + src1 = (src1 >> start); + if (len < 64) + src1 = src1 & ((1ull << len) - 1); + dst = src1; + +done: + error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); + if (error) + return (error); + + /* + * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. + * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. + */ + rflags &= ~RFLAGS_STATUS_BITS; + if (dst == 0) + rflags |= PSL_Z; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, + 8); + return (error); +} + +static int +vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vm_get_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + struct vm_copyinfo copyinfo[2]; + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, fault, size, stackaddrsize, pushop; + struct vm_guest_paging *paging; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; + paging = &vie->paging; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compatibility mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); + + if (pushop) { + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, size); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, size); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); + rsp += size; + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); + return (error); +} + +static int +vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); + return (error); +} + +static int +vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = vie_emulate_or(vie, vm, vcpuid, gpa); + break; + case 0x4: /* AND */ + error = vie_emulate_and(vie, vm, vcpuid, gpa); + break; + case 0x7: /* CMP */ + error = vie_emulate_cmp(vie, vm, vcpuid, gpa); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid, + uint64_t gpa) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. + */ + error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid) +{ + uint64_t val; + int error; + + if (vie->paging.cpl != 0) { + vm_inject_gp(vm, vcpuid); + vie->num_processed = 0; + return (0); + } + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val); + ASSERT(error == 0); + + /* Clear %cr0.TS */ + val &= ~CR0_TS; + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val); + ASSERT(error == 0); + + return (0); +} + +static int +vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int bytes) +{ + int err; + + if (vie->mmio_req_read.state == VR_DONE) { + ASSERT(vie->mmio_req_read.bytes == bytes); + ASSERT(vie->mmio_req_read.gpa == gpa); + + *rval = vie->mmio_req_read.data; + return (0); + } + + err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); + if (err == 0) { + /* + * A successful read from an in-kernel-emulated device may come + * with side effects, so stash the result in case it's used for + * an instruction which subsequently needs to issue an MMIO + * write to userspace. 
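+		 * A read-modify-write such as 'or $1, (mmio)' can hit
+		 * exactly that case: its read is satisfied in-kernel but
+		 * its write must be completed by userspace.  When the
+		 * instruction is retried after vie_fulfill_mmio(), the
+		 * VR_DONE check above replays the cached value instead
+		 * of re-reading the device register.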
+ */ + ASSERT(vie->mmio_req_read.state == VR_NONE); + + vie->mmio_req_read.bytes = bytes; + vie->mmio_req_read.gpa = gpa; + vie->mmio_req_read.data = *rval; + vie->mmio_req_read.state = VR_DONE; + + } else if (err == ESRCH) { + /* Hope that userspace emulation can fulfill this read */ + vie->mmio_req_read.bytes = bytes; + vie->mmio_req_read.gpa = gpa; + vie->mmio_req_read.state = VR_PENDING; + vie->status |= VIES_PENDING_MMIO; + } else if (err < 0) { + /* + * The MMIO read failed in such a way that fallback to handling + * in userspace is required. + */ + vie->status |= VIES_USER_FALLBACK; + } + return (err); +} + +static int +vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, + uint64_t wval, int bytes) +{ + int err; + + if (vie->mmio_req_write.state == VR_DONE) { + ASSERT(vie->mmio_req_write.bytes == bytes); + ASSERT(vie->mmio_req_write.gpa == gpa); + + return (0); + } + + err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); + if (err == 0) { + /* + * A successful write to an in-kernel-emulated device probably + * results in side effects, so stash the fact that such a write + * succeeded in case the operation requires other work. + */ + vie->mmio_req_write.bytes = bytes; + vie->mmio_req_write.gpa = gpa; + vie->mmio_req_write.data = wval; + vie->mmio_req_write.state = VR_DONE; + } else if (err == ESRCH) { + /* Hope that userspace emulation can fulfill this write */ + vie->mmio_req_write.bytes = bytes; + vie->mmio_req_write.gpa = gpa; + vie->mmio_req_write.data = wval; + vie->mmio_req_write.state = VR_PENDING; + vie->status |= VIES_PENDING_MMIO; + } else if (err < 0) { + /* + * The MMIO write failed in such a way that fallback to handling + * in userspace is required. + */ + vie->status |= VIES_USER_FALLBACK; + } + return (err); +} + +int +vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid) +{ + int error; + uint64_t gpa; + + if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != + (VIES_INST_DECODE | VIES_MMIO)) { + return (EINVAL); + } + + gpa = vie->mmio_gpa; + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = vie_emulate_group1(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_POP: + error = vie_emulate_pop(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_PUSH: + error = vie_emulate_push(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_CMP: + error = vie_emulate_cmp(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_MOV: + error = vie_emulate_mov(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = vie_emulate_movx(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_MOVS: + error = vie_emulate_movs(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_STOS: + error = vie_emulate_stos(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_AND: + error = vie_emulate_and(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_OR: + error = vie_emulate_or(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_SUB: + error = vie_emulate_sub(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_BITTEST: + error = vie_emulate_bittest(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_ADD: + error = vie_emulate_add(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_TEST: + error = vie_emulate_test(vie, vm, vcpuid, gpa); + break; + case VIE_OP_TYPE_BEXTR: + error = vie_emulate_bextr(vie, vm, vcpuid, gpa); + break; + default: + error = EINVAL; + break; + } + + if (error == ESRCH) { + /* Return to userspace with the mmio 
request */ + return (-1); + } + + return (error); +} + +static int +vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid, + uint32_t *eax) +{ + uint32_t mask, val; + bool in; + int err; + + mask = vie_size2mask(vie->inout.bytes); + in = (vie->inout.flags & INOUT_IN) != 0; + + if (!in) { + val = *eax & mask; + } + + if (vie->inout_req_state != VR_DONE) { + err = vm_ioport_access(vm, vcpuid, in, vie->inout.port, + vie->inout.bytes, &val); + val &= mask; + } else { + /* + * This port access was handled in userspace and the result was + * injected in to be handled now. + */ + val = vie->inout_req_val & mask; + vie->inout_req_state = VR_NONE; + err = 0; + } + + if (err == ESRCH) { + vie->status |= VIES_PENDING_INOUT; + vie->inout_req_state = VR_PENDING; + return (err); + } else if (err != 0) { + return (err); + } + + if (in) { + *eax = (*eax & ~mask) | val; + } + return (0); +} + +static enum vm_reg_name +vie_inout_segname(const struct vie *vie) +{ + uint8_t segidx = vie->inout.segment; + const enum vm_reg_name segmap[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + }; + const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0])); + + if (segidx >= maxidx) { + panic("unexpected segment index %u", segidx); + } + return (segmap[segidx]); +} + +static int +vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid) +{ + uint8_t bytes, addrsize; + uint64_t index, count = 0, gla, rflags; + int prot, err, fault; + bool in, repeat; + enum vm_reg_name seg_reg, idx_reg; + struct vm_copyinfo copyinfo[2]; + + in = (vie->inout.flags & INOUT_IN) != 0; + bytes = vie->inout.bytes; + addrsize = vie->inout.addrsize; + prot = in ? PROT_WRITE : PROT_READ; + + ASSERT(bytes == 1 || bytes == 2 || bytes == 4); + ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8); + + idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + seg_reg = vie_inout_segname(vie); + err = vm_get_register(vm, vcpuid, idx_reg, &index); + ASSERT(err == 0); + index = index & vie_size2mask(addrsize); + + repeat = (vie->inout.flags & INOUT_REP) != 0; + + /* Count register */ + if (repeat) { + err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count); + count &= vie_size2mask(addrsize); + + if (count == 0) { + /* + * If we were asked to emulate a REP INS/OUTS when the + * count register is zero, no further work is required. + */ + return (0); + } + } else { + count = 1; + } + + gla = 0; + if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg, + idx_reg, &gla) != 0) { + /* vie_get_gla() already injected the appropriate fault */ + return (0); + } + + /* + * The INS/OUTS emulate currently assumes that the memory target resides + * within the guest system memory, rather than a device MMIO region. If + * such a case becomes a necessity, that additional handling could be + * put in place. 
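+	 * That assumption is enforced by the vm_copy_setup() call below,
+	 * which only succeeds for addresses backed by guest system memory;
+	 * an INS/OUTS aimed at MMIO space surfaces as an error here rather
+	 * than being emulated.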
+ */ + err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot, + copyinfo, nitems(copyinfo), &fault); + + if (err) { + /* Unrecoverable error */ + return (err); + } else if (fault) { + /* Resume guest to handle fault */ + return (0); + } + + if (!in) { + vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes); + } + + err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); + + if (err == 0 && in) { + vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes); + } + + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (err == 0) { + err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + ASSERT(err == 0); + + /* Update index */ + if (rflags & PSL_D) { + index -= bytes; + } else { + index += bytes; + } + + /* Update index register */ + err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize); + ASSERT(err == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if ((vie->inout.flags & INOUT_REP) != 0) { + count--; + err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + count, addrsize); + ASSERT(err == 0); + + if (count != 0) { + return (vie_repeat(vie)); + } + } + } + + return (err); +} + +int +vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid) +{ + int err = 0; + + if ((vie->status & VIES_INOUT) == 0) { + return (EINVAL); + } + + if ((vie->inout.flags & INOUT_STR) == 0) { + /* + * For now, using the 'rep' prefixes with plain (non-string) + * in/out is not supported. + */ + if ((vie->inout.flags & INOUT_REP) != 0) { + return (EINVAL); + } + + err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); + if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) { + /* + * With the inX access now a success, the result needs + * to be stored in the guest %rax. + */ + err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vie->inout.eax); + VERIFY0(err); + } + } else { + vie->status &= ~VIES_REPEAT; + err = vie_emulate_inout_str(vie, vm, vcpuid); + + } + if (err < 0) { + /* + * Access to an I/O port failed in such a way that fallback to + * handling in userspace is required. + */ + vie->status |= VIES_USER_FALLBACK; + } else if (err == ESRCH) { + ASSERT(vie->status & VIES_PENDING_INOUT); + /* Return to userspace with the in/out request */ + err = -1; + } + + return (err); +} + +int +vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid) +{ + int error; + + if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) != + (VIES_INST_DECODE | VIES_OTHER)) { + return (EINVAL); + } + + switch (vie->op.op_type) { + case VIE_OP_TYPE_CLTS: + error = vie_emulate_clts(vie, vm, vcpuid); + break; + case VIE_OP_TYPE_MOV_CR: + error = vie_emulate_mov_cr(vie, vm, vcpuid); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +void +vie_reset(struct vie *vie) +{ + vie->status = 0; + vie->num_processed = vie->num_valid = 0; +} + +void +vie_advance_pc(struct vie *vie, uint64_t *nextrip) +{ + VERIFY((vie->status & VIES_REPEAT) == 0); + + *nextrip += vie->num_processed; + vie_reset(vie); +} + +void +vie_exitinfo(const struct vie *vie, struct vm_exit *vme) +{ + if (vie->status & VIES_USER_FALLBACK) { + /* + * Despite the fact that the instruction was successfully + * decoded, some aspect of the emulation failed in such a way + * that it is left up to userspace to complete the operation. 
+ */ + vie_fallback_exitinfo(vie, vme); + } else if (vie->status & VIES_MMIO) { + vme->exitcode = VM_EXITCODE_MMIO; + if (vie->mmio_req_read.state == VR_PENDING) { + vme->u.mmio.gpa = vie->mmio_req_read.gpa; + vme->u.mmio.data = 0; + vme->u.mmio.bytes = vie->mmio_req_read.bytes; + vme->u.mmio.read = 1; + } else if (vie->mmio_req_write.state == VR_PENDING) { + vme->u.mmio.gpa = vie->mmio_req_write.gpa; + vme->u.mmio.data = vie->mmio_req_write.data & + vie_size2mask(vie->mmio_req_write.bytes); + vme->u.mmio.bytes = vie->mmio_req_write.bytes; + vme->u.mmio.read = 0; + } else { + panic("bad pending MMIO state"); + } + } else if (vie->status & VIES_INOUT) { + vme->exitcode = VM_EXITCODE_INOUT; + vme->u.inout.port = vie->inout.port; + vme->u.inout.bytes = vie->inout.bytes; + if ((vie->inout.flags & INOUT_IN) != 0) { + vme->u.inout.flags = INOUT_IN; + vme->u.inout.eax = 0; + } else { + vme->u.inout.flags = 0; + vme->u.inout.eax = vie->inout.eax & + vie_size2mask(vie->inout.bytes); + } + } else { + panic("no pending operation"); + } +} + +/* + * In the case of a decoding or verification failure, bailing out to userspace + * to do the instruction emulation is our only option for now. + */ +void +vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme) +{ + if ((vie->status & VIES_INST_FETCH) == 0) { + bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); + } else { + ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst)); + + bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst)); + vme->u.inst_emul.num_valid = vie->num_valid; + } + vme->exitcode = VM_EXITCODE_INST_EMUL; +} + +void +vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base, + int *cs_d) +{ + struct seg_desc cs_desc; + int error; + + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); + ASSERT(error == 0); + + /* Initialization required for the paging info to be populated */ + VERIFY(vie->status & VIES_INIT); + switch (vie->paging.cpu_mode) { + case CPU_MODE_REAL: + *cs_base = cs_desc.base; + *cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + *cs_base = cs_desc.base; + *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0; + break; + default: + *cs_base = 0; + *cs_d = 0; + break; + } +} + +bool +vie_pending(const struct vie *vie) +{ + /* + * These VIE status bits indicate conditions which must be addressed + * through either device IO fulfillment (with corresponding + * vie_fulfill_*()) or complete userspace emulation (followed by a + * vie_reset()). + */ + const enum vie_status of_interest = + VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK; + + return ((vie->status & of_interest) != 0); +} + +bool +vie_needs_fetch(const struct vie *vie) +{ + if (vie->status & VIES_INST_FETCH) { + ASSERT(vie->num_valid != 0); + return (false); + } + return (true); +} + +static int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +static int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
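+	 * For example, 0x00007fffffffffff and 0xffff800000000000 are both
+	 * canonical and pass the check, while 0x0000800000000000 sets bit
+	 * 47 without the matching upper bits and is rejected.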
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +static uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +static int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %x", __func__, prot)); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %x", seg, desc->access)); + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %x", seg, type)); + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. 
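+ *
+ * As a worked example (hypothetical values): with an address size of 2,
+ * a segment base of 0x10000 and an offset of 0x1fffe, the offset is
+ * first masked down to 0xfffe, so the resulting linear address is
+ * 0x1fffe rather than 0x2fffe, i.e.:
+ *
+ *	gla = (segbase + (offset & size2mask[addrsize])) &
+ *	    size2mask[glasize];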
+ */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +void +vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, + const struct vm_guest_paging *paging, uint64_t gpa) +{ + KASSERT(inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof (struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + vie->status = VIES_INIT | VIES_MMIO; + + if (inst_length != 0) { + bcopy(inst_bytes, vie->inst, inst_length); + vie->num_valid = inst_length; + vie->status |= VIES_INST_FETCH; + } + + vie->paging = *paging; + vie->mmio_gpa = gpa; +} + +void +vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len, + const struct vm_guest_paging *paging) +{ + bzero(vie, sizeof (struct vie)); + + vie->status = VIES_INIT | VIES_INOUT; + + vie->inout = *inout; + vie->paging = *paging; + + /* + * Since VMX/SVM assists already decoded the nature of the in/out + * instruction, let the status reflect that. + */ + vie->status |= VIES_INST_FETCH | VIES_INST_DECODE; + vie->num_processed = inst_len; +} + +void +vie_init_other(struct vie *vie, const struct vm_guest_paging *paging) +{ + bzero(vie, sizeof (struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + vie->status = VIES_INIT | VIES_OTHER; + + vie->paging = *paging; +} + +int +vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result) +{ + struct vie_mmio *pending; + + if ((vie->status & VIES_MMIO) == 0 || + (vie->status & VIES_PENDING_MMIO) == 0) { + return (EINVAL); + } + + if (result->read) { + pending = &vie->mmio_req_read; + } else { + pending = &vie->mmio_req_write; + } + + if (pending->state != VR_PENDING || + pending->bytes != result->bytes || pending->gpa != result->gpa) { + return (EINVAL); + } + + if (result->read) { + pending->data = result->data & vie_size2mask(pending->bytes); + } + pending->state = VR_DONE; + vie->status &= ~VIES_PENDING_MMIO; + + return (0); +} + +int +vie_fulfill_inout(struct vie *vie, const struct vm_inout *result) +{ + if ((vie->status & VIES_INOUT) == 0 || + (vie->status & VIES_PENDING_INOUT) == 0) { + return (EINVAL); + } + if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) || + vie->inout.bytes != result->bytes || + vie->inout.port != result->port) { + return (EINVAL); + } + + if (result->flags & INOUT_IN) { + vie->inout_req_val = result->eax & + vie_size2mask(vie->inout.bytes); + } + vie->inout_req_state = VR_DONE; + vie->status &= ~(VIES_PENDING_INOUT); + + return (0); +} + +uint64_t +vie_mmio_gpa(const struct vie *vie) +{ + return (vie->mmio_gpa); +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & PROT_EXEC) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(vm_page_t **vmp) +{ + if (*vmp != NULL) { + vmp_release(*vmp); + *vmp = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) +{ + vm_client_t *vmc = vm_get_vmclient(vm, vcpu); + const uintptr_t hold_gpa = gpa & PAGEMASK; + + /* Hold must not cross a page boundary */ + VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); + + if (*vmp != 
NULL) { + vmp_release(*vmp); + } + + *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); + if (*vmp == NULL) { + return (NULL); + } + + return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); +} + +static int +_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) +{ + int nlevels, pfcode; + int ptpshift = 0, ptpindex = 0; + uint64_t ptpphys; + uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; + vm_page_t *cookie = NULL; + const bool usermode = paging->cpl == 3; + const bool writable = (prot & PROT_WRITE) != 0; + + *guest_fault = 0; +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + if (!check_only) + vm_inject_gp(vm, vcpuid); + *guest_fault = 1; + return (0); + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + return (0); + } + + if (paging->paging_mode == PAGING_MODE_32) { + uint32_t *ptpbase32, pte32; + + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); + + if (ptpbase32 == NULL) { + return (EFAULT); + } + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, + 0, pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + + ptp_release(&cookie); + *guest_fault = 1; + return (0); + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
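+ *
+ * The updates below are performed with a compare-and-swap so that a
+ * concurrent change to the PTE (by another vCPU or by the guest itself)
+ * is not clobbered; if the compare-and-swap fails, the entire walk is
+ * restarted against the then-current table contents.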
+ */ + if (!check_only && (pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + ptp_release(&cookie); + return (0); + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, + &cookie); + if (ptpbase == NULL) { + return (EFAULT); + } + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + + ptp_release(&cookie); + *guest_fault = 1; + return (0); + } + + ptpphys = pte; + + nlevels = 2; + } else { + nlevels = 4; + } + + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys &= 0x000ffffffffff000UL; + + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) { + return (EFAULT); + } + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + + ptp_release(&cookie); + *guest_fault = 1; + return (0); + } + + /* Set the accessed bit in the page table entry */ + if (!check_only && (pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, + 1, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } + + ptp_release(&cookie); + *guest_fault = 1; + return (0); + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + ptp_release(&cookie); + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); + return (0); +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + true)); +} + +int +vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip, + int *faultptr) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if ((vie->status & VIES_INIT) == 0) { + return (EINVAL); + } + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, 
&vie->paging, rip, VIE_INST_SIZE, + prot, copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = VIE_INST_SIZE; + vie->status |= VIES_INST_FETCH; + return (0); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * § 2.3.5, "The VEX Prefix", SDM Vol 2. + */ + if ((cpu_mode == CPU_MODE_64BIT || + cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { + const struct vie_op *optab; + + /* 3-byte VEX prefix. */ + vie->vex_present = 1; + + vie_advance(vie); + if (vie_peek(vie, &x)) + return (-1); + + /* + * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted + * relative to REX encoding. + */ + vie->rex_r = x & 0x80 ? 0 : 1; + vie->rex_x = x & 0x40 ? 0 : 1; + vie->rex_b = x & 0x20 ? 0 : 1; + + switch (x & 0x1F) { + case 0x2: + /* 0F 38. */ + optab = three_byte_opcodes_0f38; + break; + case 0x1: + /* 0F class - nothing handled here yet. */ + /* FALLTHROUGH */ + case 0x3: + /* 0F 3A class - nothing handled here yet. */ + /* FALLTHROUGH */ + default: + /* Reserved (#UD). */ + return (-1); + } + + vie_advance(vie); + if (vie_peek(vie, &x)) + return (-1); + + /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ + vie->rex_w = x & 0x80 ? 1 : 0; + + vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); + vie->vex_l = !!(x & 0x4); + vie->vex_pp = (x & 0x3); + + /* PP: 1=66 2=F3 3=F2 prefixes. */ + switch (vie->vex_pp) { + case 0x1: + vie->opsize_override = 1; + break; + case 0x2: + vie->repz_present = 1; + break; + case 0x3: + vie->repnz_present = 1; + break; + } + + vie_advance(vie); + + /* Opcode, sans literal prefix prefix. 
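+ * As a worked example (byte values for illustration): in the sequence
+ * c4 e2 70 f7, the second byte 0xe2 leaves rex_r/rex_x/rex_b clear (the
+ * bits are inverted) and mmmmm = 2 selects the 0F 38 map; the third
+ * byte 0x70 gives W=0, a vvvv field inverting to register 1 (%rcx),
+ * L=0 and pp=0; the following byte 0xf7 (the BEXTR opcode) is then
+ * looked up in three_byte_opcodes_0f38.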
*/ + if (vie_peek(vie, &x)) + return (-1); + + vie->op = optab[x]; + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + /* Already did this via VEX prefix. */ + if (vie->op.op_type != VIE_OP_TYPE_NONE) + return (0); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + /* + * Handling mov-to/from-cr is special since it is not issuing + * mmio/pio requests and can be done in real mode. We must bypass some + * of the other existing decoding restrictions for it. + */ + const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0); + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL && !is_movcr) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT && !is_movcr) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. 
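+ *
+ * For example (encoding shown for illustration), 8b 05 10 00 00 00
+ * carries ModRM 0x05 (mod=00, reg=000, r/m=101): in 64-bit mode it is a
+ * load of %eax from 0x10(%rip), the displacement being applied relative
+ * to the next instruction, while in compatibility mode the same bytes
+ * address absolute location 0x10 with no base register.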
+ */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. 
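+ *
+ * For example (values for illustration): with a REX.W prefix the
+ * operand size is 8, yet only four immediate bytes are fetched below;
+ * an immediate of 0x80000000 is therefore used as 0xffffffff80000000
+ * after the sign-extension at the end of this function.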
+ */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. + */ +int +vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla) +{ + int error; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; + + ASSERT((vie->status & VIES_INST_DECODE) != 0); + + /* + * If there was no valid GLA context with the exit, or the decoded + * instruction acts on more than one address, verification is done. + */ + if (gla == VIE_INVALID_GLA || + (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) { + return (0); + } + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_processed; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
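+ *
+ * For example (instructions shown for illustration), a mov through
+ * (%rbp) with no override is checked against %ss below, one through
+ * (%rsi) against %ds, and an 0x26 prefix on either selects %es instead.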
+ */ + if (vie->segment_override) { + seg = vie->segment_register; + } else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) { + seg = VM_REG_GUEST_SS; + } else { + seg = VM_REG_GUEST_DS; + } + if (vie->paging.cpu_mode == CPU_MODE_64BIT && + seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: segbase(0x%0lx)" + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); + return (-1); + } + + return (0); +} + +int +vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d) +{ + enum vm_cpu_mode cpu_mode; + + if ((vie->status & VIES_INST_FETCH) == 0) { + return (EINVAL); + } + + cpu_mode = vie->paging.cpu_mode; + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + vie->status |= VIES_INST_DECODE; + + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_ioport.c b/usr/src/uts/intel/io/vmm/vmm_ioport.c new file mode 100644 index 0000000000..3826dbe8b5 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_ioport.c @@ -0,0 +1,297 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vatpic.h" +#include "vatpit.h" +#include "vrtc.h" +#include "vmm_ioport.h" + +/* Arbitrary limit on entries per VM */ +static uint_t ioport_entry_limit = 64; + +static void +vm_inout_def(ioport_entry_t *entries, uint_t i, uint16_t port, + ioport_handler_t func, void *arg, uint16_t flags) +{ + ioport_entry_t *ent = &entries[i]; + + if (i != 0) { + const ioport_entry_t *prev = &entries[i - 1]; + /* ensure that entries are inserted in sorted order */ + VERIFY(prev->iope_port < port); + } + ent->iope_func = func; + ent->iope_arg = arg; + ent->iope_port = port; + ent->iope_flags = flags; +} + +void +vm_inout_init(struct vm *vm, struct ioport_config *cfg) +{ + struct vatpit *pit = vm_atpit(vm); + struct vatpic *pic = vm_atpic(vm); + struct vrtc *rtc = vm_rtc(vm); + const uint_t ndefault = 13; + const uint16_t flag = IOPF_FIXED; + ioport_entry_t *ents; + uint_t i = 0; + + VERIFY0(cfg->iop_entries); + VERIFY0(cfg->iop_count); + + ents = kmem_zalloc(ndefault * sizeof (ioport_entry_t), KM_SLEEP); + + /* PIC (master): 0x20-0x21 */ + vm_inout_def(ents, i++, IO_ICU1, vatpic_master_handler, pic, flag); + vm_inout_def(ents, i++, IO_ICU1 + ICU_IMR_OFFSET, vatpic_master_handler, + pic, flag); + + /* PIT: 0x40-0x43 and 0x61 (ps2 tie-in) */ + vm_inout_def(ents, i++, TIMER_CNTR0, vatpit_handler, pit, flag); + vm_inout_def(ents, i++, TIMER_CNTR1, vatpit_handler, pit, flag); + vm_inout_def(ents, i++, TIMER_CNTR2, vatpit_handler, pit, flag); + vm_inout_def(ents, i++, TIMER_MODE, vatpit_handler, pit, flag); + vm_inout_def(ents, i++, NMISC_PORT, vatpit_nmisc_handler, pit, flag); + + /* RTC: 0x70-0x71 */ + vm_inout_def(ents, i++, IO_RTC, vrtc_addr_handler, rtc, flag); + vm_inout_def(ents, i++, IO_RTC + 1, vrtc_data_handler, rtc, flag); + + /* PIC (slave): 0xa0-0xa1 */ + vm_inout_def(ents, i++, IO_ICU2, vatpic_slave_handler, pic, flag); + vm_inout_def(ents, i++, IO_ICU2 + ICU_IMR_OFFSET, vatpic_slave_handler, + pic, flag); + + /* PIC (ELCR): 0x4d0-0x4d1 */ + vm_inout_def(ents, i++, IO_ELCR1, vatpic_elc_handler, pic, flag); + vm_inout_def(ents, i++, IO_ELCR2, vatpic_elc_handler, pic, flag); + + VERIFY3U(i, ==, ndefault); + cfg->iop_entries = ents; + cfg->iop_count = ndefault; +} + +void +vm_inout_cleanup(struct vm *vm, struct ioport_config *cfg) +{ + VERIFY(cfg->iop_entries); + VERIFY(cfg->iop_count); + + kmem_free(cfg->iop_entries, + sizeof (ioport_entry_t) * cfg->iop_count); + cfg->iop_entries = NULL; + cfg->iop_count = 0; +} + +static void +vm_inout_remove_at(uint_t idx, uint_t old_count, ioport_entry_t *old_ents, + ioport_entry_t *new_ents) +{ + uint_t new_count = old_count - 1; + + VERIFY(old_count != 0); + VERIFY(idx < old_count); + + /* copy entries preceeding to-be-removed index */ + if (idx > 0) { + bcopy(old_ents, new_ents, sizeof (ioport_entry_t) * idx); + } + /* copy entries following to-be-removed index */ + if (idx < new_count) { + bcopy(&old_ents[idx + 1], &new_ents[idx], + sizeof (ioport_entry_t) * (new_count - idx)); + } +} + +static void +vm_inout_insert_space_at(uint_t idx, uint_t old_count, ioport_entry_t *old_ents, + ioport_entry_t *new_ents) +{ + uint_t new_count = old_count + 1; + + VERIFY(idx < new_count); + + /* copy entries preceeding index where space is to be added */ + if (idx > 0) { + bcopy(old_ents, 
new_ents, sizeof (ioport_entry_t) * idx); + } + /* copy entries to follow added space */ + if (idx < new_count) { + bcopy(&old_ents[idx], &new_ents[idx + 1], + sizeof (ioport_entry_t) * (old_count - idx)); + } +} + +int +vm_inout_attach(struct ioport_config *cfg, uint16_t port, uint16_t flags, + ioport_handler_t func, void *arg) +{ + uint_t i, old_count, insert_idx; + ioport_entry_t *old_ents; + + if (cfg->iop_count >= ioport_entry_limit) { + return (ENOSPC); + } + + old_count = cfg->iop_count; + old_ents = cfg->iop_entries; + for (insert_idx = i = 0; i < old_count; i++) { + const ioport_entry_t *compare = &old_ents[i]; + if (compare->iope_port == port) { + return (EEXIST); + } else if (compare->iope_port < port) { + insert_idx = i + 1; + } + } + + + ioport_entry_t *new_ents; + uint_t new_count = old_count + 1; + new_ents = kmem_alloc(new_count * sizeof (ioport_entry_t), KM_SLEEP); + vm_inout_insert_space_at(insert_idx, old_count, old_ents, new_ents); + + new_ents[insert_idx].iope_func = func; + new_ents[insert_idx].iope_arg = arg; + new_ents[insert_idx].iope_port = port; + new_ents[insert_idx].iope_flags = flags; + new_ents[insert_idx].iope_pad = 0; + + cfg->iop_entries = new_ents; + cfg->iop_count = new_count; + kmem_free(old_ents, old_count * sizeof (ioport_entry_t)); + + return (0); +} + +int +vm_inout_detach(struct ioport_config *cfg, uint16_t port, bool drv_hook, + ioport_handler_t *old_func, void **old_arg) +{ + uint_t i, old_count, remove_idx; + ioport_entry_t *old_ents; + + old_count = cfg->iop_count; + old_ents = cfg->iop_entries; + VERIFY(old_count > 1); + for (i = 0; i < old_count; i++) { + const ioport_entry_t *compare = &old_ents[i]; + if (compare->iope_port != port) { + continue; + } + /* fixed ports are not allowed to be detached at runtime */ + if ((compare->iope_flags & IOPF_FIXED) != 0) { + return (EPERM); + } + + /* + * Driver-attached and bhyve-internal ioport hooks can only be + * removed by the respective party which attached them. 
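+ *
+ * That is, a port attached with IOPF_DRV_HOOK must be detached with
+ * drv_hook set, and vice versa. A sketch of the expected pairing (port
+ * number and handler are hypothetical):
+ *
+ *	vm_inout_attach(cfg, 0x510, IOPF_DRV_HOOK, my_handler, my_arg);
+ *	...
+ *	vm_inout_detach(cfg, 0x510, true, &old_func, &old_arg);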
+ */ + if (drv_hook && (compare->iope_flags & IOPF_DRV_HOOK) == 0) { + return (EPERM); + } else if (!drv_hook && + (compare->iope_flags & IOPF_DRV_HOOK) != 0) { + return (EPERM); + } + break; + } + if (i == old_count) { + return (ENOENT); + } + remove_idx = i; + + if (old_func != NULL) { + *old_func = cfg->iop_entries[remove_idx].iope_func; + } + if (old_arg != NULL) { + *old_arg = cfg->iop_entries[remove_idx].iope_arg; + } + + ioport_entry_t *new_ents; + uint_t new_count = old_count - 1; + new_ents = kmem_alloc(new_count * sizeof (ioport_entry_t), KM_SLEEP); + vm_inout_remove_at(remove_idx, old_count, old_ents, new_ents); + + cfg->iop_entries = new_ents; + cfg->iop_count = new_count; + kmem_free(old_ents, old_count * sizeof (ioport_entry_t)); + + return (0); +} + +static ioport_entry_t * +vm_inout_find(const struct ioport_config *cfg, uint16_t port) +{ + const uint_t count = cfg->iop_count; + ioport_entry_t *entries = cfg->iop_entries; + + for (uint_t i = 0; i < count; i++) { + if (entries[i].iope_port == port) { + return (&entries[i]); + } + } + return (NULL); +} + +int +vm_inout_access(struct ioport_config *cfg, bool in, uint16_t port, + uint8_t bytes, uint32_t *val) +{ + const ioport_entry_t *ent; + int err; + + ent = vm_inout_find(cfg, port); + if (ent == NULL) { + err = ESRCH; + } else { + err = ent->iope_func(ent->iope_arg, in, port, bytes, val); + } + + return (err); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_ioport.h b/usr/src/uts/intel/io/vmm/vmm_ioport.h new file mode 100644 index 0000000000..254ba002f2 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_ioport.h @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +#include <sys/vmm_kernel.h> + +struct ioport_entry { + ioport_handler_t iope_func; + void *iope_arg; + uint16_t iope_port; + uint16_t iope_flags; + uint32_t iope_pad; +}; +typedef struct ioport_entry ioport_entry_t; + +struct ioport_config { + struct ioport_entry *iop_entries; + uint_t iop_count; +}; + +#define IOPF_DEFAULT 0 +#define IOPF_FIXED (1 << 0) /* system device fixed in position */ +#define IOPF_DRV_HOOK (1 << 1) /* external driver hook */ + +void vm_inout_init(struct vm *vm, struct ioport_config *ports); +void vm_inout_cleanup(struct vm *vm, struct ioport_config *ports); + +int vm_inout_attach(struct ioport_config *ports, uint16_t port, uint16_t flags, + ioport_handler_t func, void *arg); +int vm_inout_detach(struct ioport_config *ports, uint16_t port, bool drv_hook, + ioport_handler_t *old_func, void **old_arg); + +int vm_inout_access(struct ioport_config *ports, bool in, uint16_t port, + uint8_t bytes, uint32_t *val); + +/* + * Arbitrary cookie for io port hook: + * - top 48 bits: func address + arg + * - lower 16 bits: port + */ +#define IOP_GEN_COOKIE(func, arg, port) \ + ((uintptr_t)((((uintptr_t)(func) + (uintptr_t)(arg)) << 16) \ + | (uint16_t)(port))) +#define IOP_PORT_FROM_COOKIE(cookie) (uint16_t)(cookie) + +#endif /* _VMM_IOPORT_H_ */ diff --git a/usr/src/uts/intel/io/vmm/vmm_ktr.h b/usr/src/uts/intel/io/vmm/vmm_ktr.h new file mode 100644 index 0000000000..2e706ffc57 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_ktr.h @@ -0,0 +1,72 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ + CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ + CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ + CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ + CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ + CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ + CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ + CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ + CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ + CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ + CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/usr/src/uts/intel/io/vmm/vmm_lapic.c b/usr/src/uts/intel/io/vmm/vmm_lapic.c new file mode 100644 index 0000000000..8ef1c851d0 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_lapic.c @@ -0,0 +1,151 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
+ * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuset.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + vcpu_notify_t notify; + + if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + notify = vlapic_set_intr_ready(vlapic, vector, level); + vcpu_notify_event_type(vm, cpu, notify); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg params + * according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered MSI/MSI-X so + * ignore trigger level in 'msg'. + * + * Certain kinds of interrupt broadcasts (physical or logical-clustered + * for destination 0xff) are prohibited when the redirection hint bit is + * set for a given message. Those edge cases are ignored for now. + */ + dest = (addr >> 12) & 0xff; + phys = (addr & MSI_X86_ADDR_LOG) == 0; + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? "physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_lapic.h b/usr/src/uts/intel/io/vmm/vmm_lapic.h new file mode 100644 index 0000000000..037b15a342 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_lapic.h @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +bool lapic_msr(uint_t num); +int lapic_rdmsr(struct vm *vm, int cpu, uint_t msr, uint64_t *rval); +int lapic_wrmsr(struct vm *vm, int cpu, uint_t msr, uint64_t wval); + +int lapic_mmio_read(struct vm *vm, int cpu, uint64_t gpa, uint64_t *rval, + int size); +int lapic_mmio_write(struct vm *vm, int cpu, uint64_t gpa, uint64_t wval, + int size); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/usr/src/uts/intel/io/vmm/vmm_reservoir.c b/usr/src/uts/intel/io/vmm/vmm_reservoir.c new file mode 100644 index 0000000000..1bb64a4851 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_reservoir.c @@ -0,0 +1,820 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * VMM Memory Reservoir + * + * + * In order to make the allocation of large (multi-GiB) chunks of memory + * for bhyve VMs easier, we introduce the "VMM Reservoir", where system + * operators can set aside a substantial portion of system memory exclusively + * for VMs. 
This memory is unavailable for general use by the rest of the
+ * system. Rather than having to scour the freelist, reap kmem caches, or put
+ * pressure on the ARC, bhyve guest memory allocations can quickly determine if
+ * there is adequate reservoir memory available. Since the pages stored in the
+ * reservoir are pre-zeroed, they can be used immediately when allocated to a
+ * guest. When the memory is returned to the reservoir, it is zeroed once more
+ * to avoid leaking any sensitive data from that guest.
+ *
+ *
+ * Transient Allocations
+ *
+ * While the explicit reservoir model may work well for some applications,
+ * others may want a more traditional model, where pages for guest memory
+ * objects are allocated on demand, rather than from a pool set aside from the
+ * system. In this case, the allocation can be made in "transient" mode, where
+ * the memory is allocated normally, even if there is free capacity in the
+ * reservoir. When use of the transient allocation is complete (the guest is
+ * halted and destroyed), the pages will be freed back to the system, rather
+ * than added back to the reservoir.
+ *
+ * From an implementation standpoint, transient allocations follow the same
+ * code paths as ones using the reservoir normally. Those allocations have a
+ * tag which marks them as transient, and used/free size tallies are maintained
+ * separately for normal and transient operations. When performing a transient
+ * allocation, that amount of memory is immediately added to the reservoir,
+ * from which the allocation can be made. When freeing a transient allocation,
+ * a matching amount of memory is removed from the reservoir as part of the
+ * operation. This allows both allocation types to coexist without too much
+ * additional machinery.
+ *
+ *
+ * Administration
+ *
+ * Operators may increase, decrease, and query the amount of memory
+ * allocated to the reservoir and from it to VMs via ioctls against the vmmctl
+ * device. The total amount added to the reservoir is arbitrarily limited at
+ * this time by `vmmr_total_limit` which defaults to 80% of physmem. This is
+ * done to prevent the reservoir from inadvertently growing to a size where the
+ * system has inadequate memory to make forward progress. Memory may only be
+ * removed from the reservoir when it is free (not allocated by any guest VMs).
+ *
+ *
+ * Page Tracking
+ *
+ * The reservoir currently uses vnode association to keep track of pages under
+ * its control (either designated to the reservoir and free, or allocated to a
+ * guest VM object). This means using the existing VM system primitives for
+ * page_t instances being associated with a given (vnode, offset) tuple. It
+ * means that spans of pages, either free or allocated, need only to store a
+ * length (of the span) and an offset (into the vnode) in order to gain access
+ * to all of the underlying pages associated with that span. Associating the
+ * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
+ * properly tracked as KAS pages, but be excluded from normal dumps (unless the
+ * operator has chosen to dump all of RAM).
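+ *
+ * As a sketch of the allocation path (size and error handling are
+ * illustrative only), a caller backing a 1 GiB guest memory object with
+ * a non-transient allocation would do roughly:
+ *
+ *	vmmr_region_t *region;
+ *	int err = vmmr_alloc(1024UL * 1024 * 1024, false, &region);
+ *	if (err == 0) {
+ *		... hand out pages from 'region' to the guest object ...
+ *	}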
+ */ + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/machparam.h> +#include <sys/kmem.h> +#include <sys/stddef.h> +#include <sys/null.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <vm/seg_kmem.h> +#include <vm/hat_i86.h> + +#include <sys/vmm_reservoir.h> +#include <sys/vmm_dev.h> + +static kmutex_t vmmr_lock; + +static size_t vmmr_free_sz; +static size_t vmmr_free_transient_sz; +static size_t vmmr_adding_sz; +static size_t vmmr_alloc_sz; +static size_t vmmr_alloc_transient_sz; +static size_t vmmr_empty_sz; + +static uintptr_t vmmr_empty_last; +/* Upper limit for the size (free + allocated) of the reservoir */ +static size_t vmmr_total_limit; + +/* VA range allocated from the VMM arena for the mappings */ +static uintptr_t vmmr_va; +static uintptr_t vmmr_va_sz; + +/* Pair of AVL trees to store set of spans ordered by addr and size */ +typedef struct vmmr_treepair { + avl_tree_t by_addr; + avl_tree_t by_size; +} vmmr_treepair_t; + +/* Spans of free memory in the reservoir */ +static vmmr_treepair_t vmmr_free_tp; + +/* Spans of empty (not backed by memory) space in the reservoir */ +static vmmr_treepair_t vmmr_empty_tp; + +/* Regions of memory allocated from the reservoir */ +static list_t vmmr_alloc_regions; + +struct vmmr_span { + uintptr_t vs_addr; + size_t vs_size; + avl_node_t vs_by_addr; + avl_node_t vs_by_size; + uintptr_t vs_region_addr; +}; +typedef struct vmmr_span vmmr_span_t; + +struct vmmr_region { + size_t vr_size; + avl_tree_t vr_spans; + list_node_t vr_node; + bool vr_transient; +}; + +static int +vmmr_cmp_addr(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_addr == sb->vs_addr) { + return (0); + } else if (sa->vs_addr < sb->vs_addr) { + return (-1); + } else { + return (1); + } +} + +static int +vmmr_cmp_size(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_size == sb->vs_size) { + /* + * Since discontiguous spans could have the same size in a + * by-size tree, differentiate them (as required by AVL) by + * address so they can safely coexist while remaining sorted. + */ + return (vmmr_cmp_addr(a, b)); + } else if (sa->vs_size < sb->vs_size) { + return (-1); + } else { + return (1); + } +} + +static int +vmmr_cmp_region_addr(const void *a, const void *b) +{ + const vmmr_span_t *sa = a; + const vmmr_span_t *sb = b; + + if (sa->vs_region_addr == sb->vs_region_addr) { + return (0); + } else if (sa->vs_region_addr < sb->vs_region_addr) { + return (-1); + } else { + return (1); + } +} + +static void +vmmr_tp_init(vmmr_treepair_t *tree) +{ + avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t), + offsetof(vmmr_span_t, vs_by_addr)); + avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t), + offsetof(vmmr_span_t, vs_by_size)); +} + +static void +vmmr_tp_destroy(vmmr_treepair_t *tree) +{ + void *vcp = NULL; + vmmr_span_t *span; + + while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) { + /* Freeing spans will be done when tearing down by-size tree */ + } + while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) { + kmem_free(span, sizeof (*span)); + } + avl_destroy(&tree->by_addr); + avl_destroy(&tree->by_size); +} + +/* + * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent + * span(s). 
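+ * For example (addresses for illustration), adding a span covering
+ * [0x2000, 0x3000) to a pair already holding [0x1000, 0x2000) and
+ * [0x3000, 0x4000) collapses all three into a single [0x1000, 0x4000)
+ * span.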
Such concatenation could result in the `to_add` span being freed, + * so the caller cannot use it after this returns. + */ +static void +vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree) +{ + avl_tree_t *by_addr = &tree->by_addr; + avl_tree_t *by_size = &tree->by_size; + vmmr_span_t *node; + avl_index_t where; + + /* This addr should not already exist in the treepair */ + node = avl_find(by_addr, to_add, &where); + ASSERT3P(node, ==, NULL); + + node = avl_nearest(by_addr, where, AVL_BEFORE); + if (node != NULL && + (node->vs_addr + node->vs_size) == to_add->vs_addr) { + /* concat with preceeding item */ + avl_remove(by_addr, node); + avl_remove(by_size, node); + node->vs_size += to_add->vs_size; + kmem_free(to_add, sizeof (*to_add)); + + /* + * Since this now-concatenated span could be adjacent one + * trailing it, fall through to perform that check. + */ + to_add = node; + } + + node = avl_nearest(by_addr, where, AVL_AFTER); + if (node != NULL && + (to_add->vs_addr + to_add->vs_size) == node->vs_addr) { + /* concat with trailing item */ + avl_remove(by_addr, node); + avl_remove(by_size, node); + node->vs_addr = to_add->vs_addr; + node->vs_size += to_add->vs_size; + avl_add(by_addr, node); + avl_add(by_size, node); + + kmem_free(to_add, sizeof (*to_add)); + return; + } + + /* simply insert */ + avl_add(by_addr, to_add); + avl_add(by_size, to_add); +} + +/* + * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of + * the exact target size is not present, but a larger one is. May return a span + * with a size smaller than the target if splitting is not an option. + */ +static vmmr_span_t * +vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree) +{ + avl_tree_t *by_addr = &tree->by_addr; + avl_tree_t *by_size = &tree->by_size; + vmmr_span_t *span; + avl_index_t where; + + ASSERT3U(target_sz, !=, 0); + ASSERT(!avl_is_empty(by_addr)); + ASSERT(!avl_is_empty(by_size)); + + vmmr_span_t search = { .vs_size = target_sz }; + span = avl_find(by_size, &search, &where); + if (span == NULL) { + /* Try for a larger span (instead of exact match) */ + span = avl_nearest(by_size, where, AVL_AFTER); + if (span == NULL) { + /* + * Caller will need to collect several smaller spans in + * order to fulfill their request. + */ + span = avl_nearest(by_size, where, AVL_BEFORE); + ASSERT3P(span, !=, NULL); + } + } + + if (span->vs_size <= target_sz) { + avl_remove(by_size, span); + avl_remove(by_addr, span); + + return (span); + } else { + /* Split off adequate chunk from larger span */ + uintptr_t start = span->vs_addr + span->vs_size - target_sz; + + avl_remove(by_size, span); + span->vs_size -= target_sz; + avl_add(by_size, span); + + vmmr_span_t *split_span = + kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); + split_span->vs_addr = start; + split_span->vs_size = target_sz; + + return (split_span); + } +} + +void +vmmr_init() +{ + mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * `vmm_total_limit` represents the absolute maximum size of the VMM + * memory reservoir. It is meant to provide some measure of protection + * against an operator pushing the system into unrecoverable memory + * starvation through explicit or transient additions to the reservoir. + * + * There will be many situations where this limit would be inadequate to + * prevent kernel memory starvation in the face of certain operator + * actions. It is a balance to be struck between safety and allowing + * large systems to reach high utilization. 
+ * + * The value is based off of pages_pp_maximum: "Number of currently + * available pages that cannot be 'locked'". It is sized as all of + * `physmem` less 120% of `pages_pp_maximum`. + */ + vmmr_total_limit = + (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10; + + vmmr_empty_last = 0; + vmmr_free_sz = 0; + vmmr_alloc_sz = 0; + vmmr_empty_sz = 0; + vmmr_adding_sz = 0; + vmmr_free_transient_sz = 0; + vmmr_alloc_transient_sz = 0; + + vmmr_tp_init(&vmmr_free_tp); + vmmr_tp_init(&vmmr_empty_tp); + + list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t), + offsetof(vmmr_region_t, vr_node)); + + /* Grab a chunk of VA for the reservoir */ + vmmr_va_sz = physmem * PAGESIZE; + vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP); +} + +void +vmmr_fini() +{ + mutex_enter(&vmmr_lock); + VERIFY3U(vmmr_alloc_sz, ==, 0); + VERIFY3U(vmmr_free_sz, ==, 0); + VERIFY3U(vmmr_adding_sz, ==, 0); + VERIFY3U(vmmr_alloc_transient_sz, ==, 0); + VERIFY3U(vmmr_free_transient_sz, ==, 0); + VERIFY(avl_is_empty(&vmmr_free_tp.by_addr)); + VERIFY(avl_is_empty(&vmmr_free_tp.by_size)); + VERIFY(list_is_empty(&vmmr_alloc_regions)); + + vmmr_tp_destroy(&vmmr_free_tp); + vmmr_tp_destroy(&vmmr_empty_tp); + list_destroy(&vmmr_alloc_regions); + + /* Release reservoir VA chunk */ + vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz); + vmmr_va = 0; + vmmr_va_sz = 0; + vmmr_total_limit = 0; + vmmr_empty_last = 0; + + mutex_exit(&vmmr_lock); + mutex_destroy(&vmmr_lock); +} + +bool +vmmr_is_empty() +{ + mutex_enter(&vmmr_lock); + bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 && + vmmr_free_sz == 0 && vmmr_free_transient_sz == 0); + mutex_exit(&vmmr_lock); + return (res); +} + +int +vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + if (!transient) { + mutex_enter(&vmmr_lock); + if (sz > vmmr_free_sz) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + } else { + int err; + + err = vmmr_add(sz, true); + if (err != 0) { + return (err); + } + mutex_enter(&vmmr_lock); + VERIFY3U(vmmr_free_transient_sz, >=, sz); + } + + vmmr_region_t *region; + region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP); + avl_create(®ion->vr_spans, vmmr_cmp_region_addr, + sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr)); + region->vr_size = sz; + + size_t remain = sz; + uintptr_t map_at = 0; + while (remain > 0) { + vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); + + /* + * We have already ensured that adequate free memory is present + * in the reservoir for this allocation. 
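		 *
		 * (Editorial illustration: the request may be satisfied
		 * by several spans.  With free spans of 5 and 3 pages
		 * and a 6-page allocation, the first pass returns the
		 * whole 5-page span, since no single span is large
		 * enough to split, and the second pass splits one page
		 * off the 3-page span, leaving 2 pages free.)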
+ */ + VERIFY3P(span, !=, NULL); + ASSERT3U(span->vs_size, <=, remain); + + span->vs_region_addr = map_at; + avl_add(®ion->vr_spans, span); + map_at += span->vs_size; + remain -= span->vs_size; + } + + if (!transient) { + vmmr_free_sz -= sz; + vmmr_alloc_sz += sz; + } else { + vmmr_free_transient_sz -= sz; + vmmr_alloc_transient_sz += sz; + region->vr_transient = true; + } + list_insert_tail(&vmmr_alloc_regions, region); + mutex_exit(&vmmr_lock); + + *resp = region; + return (0); +} + +void * +vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off) +{ + /* just use KPM region for now */ + return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off))); +} + +pfn_t +vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off) +{ + VERIFY3U(off & PAGEOFFSET, ==, 0); + VERIFY3U(off, <, region->vr_size); + + vmmr_span_t search = { + .vs_region_addr = off + }; + avl_index_t where; + vmmr_span_t *span = avl_find(®ion->vr_spans, &search, &where); + + if (span == NULL) { + span = avl_nearest(®ion->vr_spans, where, AVL_BEFORE); + ASSERT3P(span, !=, NULL); + } + uintptr_t span_off = off - span->vs_region_addr + span->vs_addr; + page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off); + VERIFY(pp != NULL); + return (pp->p_pagenum); +} + +void +vmmr_free(vmmr_region_t *region) +{ + mutex_enter(&vmmr_lock); + if (!region->vr_transient) { + VERIFY3U(region->vr_size, <=, vmmr_alloc_sz); + } else { + VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz); + } + list_remove(&vmmr_alloc_regions, region); + mutex_exit(&vmmr_lock); + + /* Zero the contents */ + for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) { + bzero(vmmr_region_mem_at(region, off), PAGESIZE); + } + + mutex_enter(&vmmr_lock); + + /* Put the contained span(s) back in the free pool */ + void *cookie = NULL; + vmmr_span_t *span; + while ((span = avl_destroy_nodes(®ion->vr_spans, &cookie)) != NULL) { + span->vs_region_addr = 0; + vmmr_tp_insert_concat(span, &vmmr_free_tp); + } + avl_destroy(®ion->vr_spans); + if (!region->vr_transient) { + vmmr_free_sz += region->vr_size; + vmmr_alloc_sz -= region->vr_size; + } else { + vmmr_free_transient_sz += region->vr_size; + vmmr_alloc_transient_sz -= region->vr_size; + } + mutex_exit(&vmmr_lock); + + if (region->vr_transient) { + vmmr_remove(region->vr_size, true); + } + kmem_free(region, sizeof (*region)); +} + +static void +vmmr_destroy_pages(vmmr_span_t *span) +{ + const uintptr_t end = span->vs_addr + span->vs_size; + struct vnode *vp = &kvps[KV_VVP]; + for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { + page_t *pp; + + /* Page-free logic cribbed from segkmem_xfree(): */ + pp = page_find(vp, (u_offset_t)pos); + VERIFY(pp != NULL); + if (!page_tryupgrade(pp)) { + /* + * Some other thread has a sharelock. Wait for + * it to drop the lock so we can free this page. + */ + page_unlock(pp); + pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL); + } + + /* + * Clear p_lckcnt so page_destroy() doesn't update availrmem. + * That will be taken care of later via page_unresv(). 
+ */ + pp->p_lckcnt = 0; + page_destroy(pp, 0); + } +} + +static int +vmmr_alloc_pages(const vmmr_span_t *span) +{ + struct seg kseg = { + .s_as = &kas + }; + struct vnode *vp = &kvps[KV_VVP]; + + const uintptr_t end = span->vs_addr + span->vs_size; + for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { + page_t *pp; + + pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE, + PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos)); + + if (pp == NULL) { + /* Destroy any already-created pages */ + if (pos != span->vs_addr) { + vmmr_span_t destroy_span = { + .vs_addr = span->vs_addr, + .vs_size = pos - span->vs_addr, + }; + + vmmr_destroy_pages(&destroy_span); + } + return (ENOMEM); + } + + /* mimic page state from segkmem */ + ASSERT(PAGE_EXCL(pp)); + page_io_unlock(pp); + pp->p_lckcnt = 1; + page_downgrade(pp); + + /* pre-zero the page */ + bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE); + } + + return (0); +} + +static int +vmmr_resv_wait() +{ + if (delay_sig(hz >> 2) != 0) { + /* bail due to interruption */ + return (0); + } + return (1); +} + +static void +vmmr_remove_raw(size_t sz) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + VERIFY(MUTEX_HELD(&vmmr_lock)); + + size_t remain = sz; + while (remain > 0) { + vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); + + /* + * The caller must ensure that at least `sz` amount is present + * in the free treepair. + */ + VERIFY3P(span, !=, NULL); + ASSERT3U(span->vs_size, <=, remain); + + /* TODO: perhaps arrange to destroy pages outside the lock? */ + vmmr_destroy_pages(span); + + remain -= span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_empty_tp); + } + + vmmr_empty_sz += sz; +} + +int +vmmr_add(size_t sz, bool transient) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + mutex_enter(&vmmr_lock); + /* + * Make sure that the amount added is not going to breach the limits + * we've chosen + */ + const size_t current_total = + vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz + + vmmr_alloc_transient_sz + vmmr_free_transient_sz; + if ((current_total + sz) < current_total) { + mutex_exit(&vmmr_lock); + return (EOVERFLOW); + } + if ((current_total + sz) > vmmr_total_limit) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + vmmr_adding_sz += sz; + mutex_exit(&vmmr_lock); + + /* Wait for enough pages to become available */ + if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) { + mutex_enter(&vmmr_lock); + vmmr_adding_sz -= sz; + mutex_exit(&vmmr_lock); + + return (EINTR); + } + + mutex_enter(&vmmr_lock); + size_t added = 0; + size_t remain = sz; + while (added < sz) { + vmmr_span_t *span = NULL; + + if (vmmr_empty_sz > 0) { + span = vmmr_tp_remove_split(remain, &vmmr_empty_tp); + + vmmr_empty_sz -= span->vs_size; + } else { + /* + * No empty space to fill with new pages, so just tack + * it on at the end instead. + */ + span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); + span->vs_addr = vmmr_empty_last; + span->vs_size = remain; + vmmr_empty_last += remain; + } + VERIFY3P(span, !=, NULL); + + + /* Allocate the actual pages to back this span */ + mutex_exit(&vmmr_lock); + int err = vmmr_alloc_pages(span); + mutex_enter(&vmmr_lock); + + /* + * If an error is encountered during page allocation for the + * span, unwind any progress made by the addition request. + */ + if (err != 0) { + /* + * Without pages allocated to this span, it is now + * tracked as empty. 
+ */ + vmmr_empty_sz += span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_empty_tp); + + if (added != 0) { + vmmr_remove_raw(added); + } + + vmmr_adding_sz -= sz; + mutex_exit(&vmmr_lock); + + page_unresv(sz >> PAGESHIFT); + return (err); + } + + /* + * The allocated-page-bearing span is placed in the "free" + * treepair now, but is not officially exposed for consumption + * until `vmm_free_sz` or `vmm_free_transient_sz` are updated. + * + * This allows us to unwind the allocation in case of a failure + * without the risk of the freshly added span(s) being snapped + * up by a consumer already. + */ + added += span->vs_size; + remain -= span->vs_size; + vmmr_tp_insert_concat(span, &vmmr_free_tp); + } + + /* Make the added memory usable by exposing it to the size accounting */ + if (!transient) { + vmmr_free_sz += added; + } else { + vmmr_free_transient_sz += added; + } + ASSERT3U(added, ==, sz); + vmmr_adding_sz -= added; + + mutex_exit(&vmmr_lock); + return (0); +} + +int +vmmr_remove(size_t sz, bool transient) +{ + VERIFY3U(sz & PAGEOFFSET, ==, 0); + + mutex_enter(&vmmr_lock); + if ((!transient && sz > vmmr_free_sz) || + (transient && sz > vmmr_free_transient_sz)) { + mutex_exit(&vmmr_lock); + return (ENOSPC); + } + + vmmr_remove_raw(sz); + + if (!transient) { + vmmr_free_sz -= sz; + } else { + vmmr_free_transient_sz -= sz; + } + mutex_exit(&vmmr_lock); + page_unresv(sz >> PAGESHIFT); + return (0); +} + +int +vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) +{ + switch (cmd) { + case VMM_RESV_QUERY: { + struct vmm_resv_query res; + void *datap = (void *)(uintptr_t)arg; + + /* For now, anyone in GZ can query */ + if (crgetzoneid(cr) != GLOBAL_ZONEID) { + return (EPERM); + } + mutex_enter(&vmmr_lock); + res.vrq_free_sz = vmmr_free_sz; + res.vrq_alloc_sz = vmmr_alloc_sz; + res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz; + res.vrq_limit = vmmr_total_limit; + mutex_exit(&vmmr_lock); + if (ddi_copyout(&res, datap, sizeof (res), md) != 0) { + return (EFAULT); + } + break; + } + case VMM_RESV_ADD: { + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (EPERM); + } + return (vmmr_add((size_t)arg, false)); + } + case VMM_RESV_REMOVE: { + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (EPERM); + } + return (vmmr_remove((size_t)arg, false)); + } + default: + return (ENOTTY); + } + return (0); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_dev.c b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c new file mode 100644 index 0000000000..eacad25e5d --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c @@ -0,0 +1,2894 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
+ * Copyright 2021 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/cpuvar.h> +#include <sys/ioccom.h> +#include <sys/stat.h> +#include <sys/vmsystm.h> +#include <sys/ddi.h> +#include <sys/mkdev.h> +#include <sys/sunddi.h> +#include <sys/fs/dv_node.h> +#include <sys/cpuset.h> +#include <sys/id_space.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/smt.h> +#include <sys/kstat.h> + +#include <sys/kernel.h> +#include <sys/hma.h> +#include <sys/x86_archext.h> +#include <x86/apicreg.h> + +#include <sys/vmm.h> +#include <sys/vmm_kernel.h> +#include <sys/vmm_instruction_emul.h> +#include <sys/vmm_dev.h> +#include <sys/vmm_impl.h> +#include <sys/vmm_drv.h> +#include <sys/vmm_vm.h> +#include <sys/vmm_reservoir.h> + +#include <vm/seg_dev.h> + +#include "io/ppt.h" +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "io/vrtc.h" +#include "io/vhpet.h" +#include "io/vpmtmr.h" +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_util.h" + +/* + * Locking details: + * + * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is + * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data + * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire + * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to + * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. + */ + +static kmutex_t vmmdev_mtx; +static dev_info_t *vmmdev_dip; +static hma_reg_t *vmmdev_hma_reg; +static uint_t vmmdev_hma_ref; +static sdev_plugin_hdl_t vmmdev_sdev_hdl; + +static kmutex_t vmm_mtx; +static list_t vmm_list; +static list_t vmm_destroy_list; +static id_space_t *vmm_minors; +static void *vmm_statep; + +static const char *vmmdev_hvm_name = "bhyve"; + +/* For sdev plugin (/dev) */ +#define VMM_SDEV_ROOT "/dev/vmm" + +/* From uts/intel/io/vmm/intel/vmx.c */ +extern int vmx_x86_supported(const char **); + +/* Holds and hooks from drivers external to vmm */ +struct vmm_hold { + list_node_t vmh_node; + vmm_softc_t *vmh_sc; + boolean_t vmh_release_req; + uint_t vmh_ioport_hook_cnt; +}; + +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + vm_client_t *vml_vmclient; + boolean_t vml_expired; + boolean_t vml_break_deferred; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + struct vmm_hold *vml_hold; +}; + +static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_block(vmm_softc_t *); +static void vmm_lease_unblock(vmm_softc_t *); +static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); +static void vmm_kstat_init(vmm_softc_t *); +static void vmm_kstat_fini(vmm_softc_t *); + +/* + * The 'devmem' hack: + * + * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments + * in the vm which appear with their own name related to the vm under /dev. + * Since this would be a hassle from an sdev perspective and would require a + * new cdev interface (or complicate the existing one), we choose to implement + * this in a different manner. Direct access to the underlying vm memory + * segments is exposed by placing them in a range of offsets beyond the normal + * guest memory space. Userspace can query the appropriate offset to mmap() + * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 
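 *
 * A hypothetical userspace sketch (editorial, not part of this change):
 * after a devmem segment has been allocated, its contents could be mapped
 * by querying the offset and handing it to mmap() on the per-VM device
 * node (assumed here to live under /dev/vmm/<name>); 'vmfd', 'segid', and
 * 'seg_len' stand in for state the caller already has.
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	void *base = MAP_FAILED;
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}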
+ */ + +static vmm_devmem_entry_t * +vmmdev_devmem_find(vmm_softc_t *sc, int segid) +{ + vmm_devmem_entry_t *ent = NULL; + list_t *dl = &sc->vmm_devmem_list; + + for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { + if (ent->vde_segid == segid) { + return (ent); + } + } + return (NULL); +} + +static int +vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) +{ + int error; + bool sysmem; + + error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, + NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + vmm_devmem_entry_t *de; + + de = vmmdev_devmem_find(sc, mseg->segid); + if (de != NULL) { + (void) strlcpy(mseg->name, de->vde_name, + sizeof (mseg->name)); + } + } else { + bzero(mseg->name, sizeof (mseg->name)); + } + + return (error); +} + +static int +vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) +{ + off_t map_offset; + vmm_devmem_entry_t *entry; + + if (list_is_empty(&sc->vmm_devmem_list)) { + map_offset = VM_DEVMEM_START; + } else { + entry = list_tail(&sc->vmm_devmem_list); + map_offset = entry->vde_off + entry->vde_len; + if (map_offset < entry->vde_off) { + /* Do not tolerate overflow */ + return (ERANGE); + } + /* + * XXXJOY: We could choose to search the list for duplicate + * names and toss an error. Since we're using the offset + * method for now, it does not make much of a difference. + */ + } + + entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); + entry->vde_segid = mseg->segid; + entry->vde_len = mseg->len; + entry->vde_off = map_offset; + (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); + list_insert_tail(&sc->vmm_devmem_list, entry); + + return (0); +} + +static boolean_t +vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, + off_t *map_offp) +{ + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + const off_t map_end = off + len; + + VERIFY(off >= VM_DEVMEM_START); + + if (map_end < off) { + /* No match on overflow */ + return (B_FALSE); + } + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + const off_t item_end = de->vde_off + de->vde_len; + + if (de->vde_off <= off && item_end >= map_end) { + *segidp = de->vde_segid; + *map_offp = off - de->vde_off; + return (B_TRUE); + } + } + return (B_FALSE); +} + +static void +vmmdev_devmem_purge(vmm_softc_t *sc) +{ + vmm_devmem_entry_t *entry; + + while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { + kmem_free(entry, sizeof (*entry)); + } +} + +static int +vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) +{ + int error; + bool sysmem = true; + + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + } + error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); + + if (error == 0) { + /* + * Rather than create a whole fresh device from which userspace + * can mmap this segment, instead make it available at an + * offset above where the main guest memory resides. + */ + error = vmmdev_devmem_create(sc, mseg, mseg->name); + if (error != 0) { + vm_free_memseg(sc->vmm_vm, mseg->segid); + } + } + return (error); +} + +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. 
Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. + */ + +static void +vcpu_lock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); + + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); +} + +static void +vcpu_unlock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); + + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); + vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); +} + +static void +vmm_read_lock(vmm_softc_t *sc) +{ + rw_enter(&sc->vmm_rwlock, RW_READER); +} + +static void +vmm_read_unlock(vmm_softc_t *sc) +{ + rw_exit(&sc->vmm_rwlock); +} + +static void +vmm_write_lock(vmm_softc_t *sc) +{ + int maxcpus; + + /* First lock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); + } + + /* + * Block vmm_drv leases from being acquired or held while the VM write + * lock is held. + */ + vmm_lease_block(sc); + + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. 
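	 *
	 * (Editorial note: if 'maxcpus' could grow at runtime, the vCPU
	 * locking loop above might have frozen fewer vCPUs than exist by
	 * the time the write lock is acquired; the VERIFY below guards
	 * against that situation arising unnoticed.)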
+ */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); +} + +static void +vmm_write_unlock(vmm_softc_t *sc) +{ + int maxcpus; + + /* Allow vmm_drv leases to be acquired once write lock is dropped */ + vmm_lease_unblock(sc); + + /* + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. + */ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_unlock_one(sc, vcpu); + } +} + +static int +vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, + cred_t *credp, int *rvalp) +{ + int error = 0, vcpu = -1; + void *datap = (void *)arg; + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: + case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: + case VM_GLA2GPA_NOFAULT: + case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: + case VM_RESTART_INSTRUCTION: + case VM_SET_KERNEMU_DEV: + case VM_GET_KERNEMU_DEV: + case VM_RESET_CPU: + case VM_GET_RUN_STATE: + case VM_SET_RUN_STATE: + case VM_GET_FPU: + case VM_SET_FPU: + /* + * Copy in the ID of the vCPU chosen for this operation. + * Since a nefarious caller could update their struct between + * this locking and when the rest of the ioctl data is copied + * in, it is _critical_ that this local 'vcpu' variable be used + * rather than the in-struct one when performing the ioctl. + */ + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + return (EFAULT); + } + if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { + return (EINVAL); + } + vcpu_lock_one(sc, vcpu); + lock_type = LOCK_VCPU; + break; + + case VM_REINIT: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_PPTDEV_MMIO: + case VM_UNMAP_PPTDEV_MMIO: + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_MUNMAP_MEMSEG: + case VM_WRLOCK_CYCLE: + case VM_PMTMR_LOCATE: + vmm_write_lock(sc); + lock_type = LOCK_WRITE_HOLD; + break; + + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + case VM_LAPIC_IRQ: + case VM_INJECT_NMI: + case VM_IOAPIC_ASSERT_IRQ: + case VM_IOAPIC_DEASSERT_IRQ: + case VM_IOAPIC_PULSE_IRQ: + case VM_LAPIC_MSI: + case VM_LAPIC_LOCAL_IRQ: + case VM_GET_X2APIC_STATE: + case VM_RTC_READ: + case VM_RTC_WRITE: + case VM_RTC_SETTIME: + case VM_RTC_GETTIME: + case VM_PPTDEV_DISABLE_MSIX: + case VM_DEVMEM_GETOFFSET: + case VM_TRACK_DIRTY_PAGES: + vmm_read_lock(sc); + lock_type = LOCK_READ_HOLD; + break; + + case VM_GET_GPA_PMAP: + case VM_IOAPIC_PINCOUNT: + case VM_SUSPEND: + case VM_DESC_FPU_AREA: + default: + break; + } + + /* Execute the primary logic for the ioctl. */ + switch (cmd) { + case VM_RUN: { + struct vm_entry entry; + + if (ddi_copyin(datap, &entry, sizeof (entry), md)) { + error = EFAULT; + break; + } + + if (!(curthread->t_schedflag & TS_VCPU)) + smt_mark_as_vcpu(); + + error = vm_run(sc->vmm_vm, vcpu, &entry); + + /* + * Unexpected states in vm_run() are expressed through positive + * errno-oriented return values. 
VM states which expect further + * processing in userspace (necessary context via exitinfo) are + * expressed through negative return values. For the time being + * a return value of 0 is not expected from vm_run(). + */ + ASSERT(error != 0); + if (error < 0) { + const struct vm_exit *vme; + void *outp = entry.exit_data; + + error = 0; + vme = vm_exitinfo(sc->vmm_vm, vcpu); + if (ddi_copyout(vme, outp, sizeof (*vme), md)) { + error = EFAULT; + } + } + break; + } + case VM_SUSPEND: { + struct vm_suspend vmsuspend; + + if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { + error = EFAULT; + break; + } + error = vm_suspend(sc->vmm_vm, vmsuspend.how); + break; + } + case VM_REINIT: { + struct vm_reinit reinit; + + if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { + error = EFAULT; + break; + } + if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { + /* + * The VM instance should be free of driver-attached + * hooks during the reinitialization process. + */ + break; + } + error = vm_reinit(sc->vmm_vm, reinit.flags); + (void) vmm_drv_block_hook(sc, B_FALSE); + break; + } + case VM_STAT_DESC: { + struct vm_stat_desc statdesc; + + if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { + error = EFAULT; + break; + } + error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, + sizeof (statdesc.desc)); + if (error == 0 && + ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { + error = EFAULT; + break; + } + break; + } + case VM_STATS_IOC: { + struct vm_stats vmstats; + + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { + error = EFAULT; + break; + } + hrt2tv(gethrtime(), &vmstats.tv); + error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, + &vmstats.num_entries, vmstats.statbuf); + if (error == 0 && + ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_PPTDEV_MSI: { + struct vm_pptdev_msi pptmsi; + + if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { + error = EFAULT; + break; + } + error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, + pptmsi.addr, pptmsi.msg, pptmsi.numvec); + break; + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix pptmsix; + + if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { + error = EFAULT; + break; + } + error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, + pptmsix.idx, pptmsix.addr, pptmsix.msg, + pptmsix.vector_control); + break; + } + case VM_PPTDEV_DISABLE_MSIX: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); + break; + } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio pptmmio; + + if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { + error = EFAULT; + break; + } + error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, + pptmmio.len, pptmmio.hpa); + break; + } + case VM_UNMAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio pptmmio; + + if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { + error = EFAULT; + break; + } + error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, + pptmmio.len); + break; + } + case VM_BIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; + } + case VM_UNBIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = 
vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; + } + case VM_GET_PPTDEV_LIMITS: { + struct vm_pptdev_limits pptlimits; + + if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, + &pptlimits.msi_limit, &pptlimits.msix_limit); + if (error == 0 && + ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + break; + } + case VM_INJECT_EXCEPTION: { + struct vm_exception vmexc; + if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { + error = EFAULT; + break; + } + error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, + vmexc.error_code_valid, vmexc.error_code, + vmexc.restart_instruction); + break; + } + case VM_INJECT_NMI: { + struct vm_nmi vmnmi; + + if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { + error = EFAULT; + break; + } + error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); + break; + } + case VM_LAPIC_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; + } + error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); + break; + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; + } + error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, + vmirq.vector); + break; + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi vmmsi; + + if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { + error = EFAULT; + break; + } + error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); + break; + } + + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; + } + error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_PINCOUNT: { + int pincount; + + pincount = vioapic_pincount(sc->vmm_vm); + if (ddi_copyout(&pincount, datap, sizeof (int), md)) { + error = EFAULT; + break; + } + break; + } + case VM_DESC_FPU_AREA: { + struct vm_fpu_desc desc; + void *buf = NULL; + + if (ddi_copyin(datap, &desc, sizeof (desc), md)) { + error = EFAULT; + break; + } + if (desc.vfd_num_entries > 64) { + error = EINVAL; + break; + } + const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * + desc.vfd_num_entries; + if (buf_sz != 0) { + buf = kmem_zalloc(buf_sz, KM_SLEEP); + } + + /* + * For now, we are depending on vm_fpu_desc_entry and + * hma_xsave_state_desc_t having the same format. 
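		 *
		 * (Editorial note: the CTASSERT below catches a size
		 * mismatch between the two structures at build time,
		 * though not a divergence in field layout.)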
+ */ + CTASSERT(sizeof (struct vm_fpu_desc_entry) == + sizeof (hma_xsave_state_desc_t)); + + size_t req_size; + const uint_t max_entries = hma_fpu_describe_xsave_state( + (hma_xsave_state_desc_t *)buf, + desc.vfd_num_entries, + &req_size); + + desc.vfd_req_size = req_size; + desc.vfd_num_entries = max_entries; + if (buf_sz != 0) { + if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { + error = EFAULT; + } + kmem_free(buf, buf_sz); + } + + if (error == 0) { + if (ddi_copyout(&desc, datap, sizeof (desc), md)) { + error = EFAULT; + } + } + break; + } + + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_assert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_deassert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_pulse_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger isa_irq_trigger; + + if (ddi_copyin(datap, &isa_irq_trigger, + sizeof (isa_irq_trigger), md)) { + error = EFAULT; + break; + } + error = vatpic_set_irq_trigger(sc->vmm_vm, + isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); + break; + } + + case VM_MMAP_GETNEXT: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, + &mm.segoff, &mm.len, &mm.prot, &mm.flags); + if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { + error = EFAULT; + break; + } + break; + } + case VM_MMAP_MEMSEG: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, + mm.len, mm.prot, mm.flags); + break; + } + case VM_MUNMAP_MEMSEG: { + struct vm_munmap mu; + + if (ddi_copyin(datap, &mu, sizeof (mu), md)) { + error = EFAULT; + break; + } + error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); + break; + } + case VM_ALLOC_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_alloc_memseg(sc, &vmseg); + break; + } + case VM_GET_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_get_memseg(sc, &vmseg); + if (error == 0 && + ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, + &vmreg.regval); + if (error == 0 && + ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_REGISTER: { + struct vm_register vmreg; + + if 
(ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, + vmreg.regval); + break; + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + break; + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + if (error == 0 && + ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + if (regnums[i] < 0) { + error = EINVAL; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], + ®vals[i]); + } + if (error == 0 && ddi_copyout(regvals, vrs.regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + } + break; + } + case VM_SET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + if (ddi_copyin(vrs.regvals, regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + /* + * Setting registers in a set is not atomic, since a + * failure in the middle of the set will cause a + * bail-out and inconsistent register state. Callers + * should be wary of this. 
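			 *
			 * (Editorial example: if the third entry of a
			 * five-register set is rejected with EINVAL, the
			 * first two registers have already been updated
			 * and are not rolled back.)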
+ */ + if (regnums[i] < 0) { + error = EINVAL; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], + regvals[i]); + } + break; + } + case VM_RESET_CPU: { + struct vm_vcpu_reset vvr; + + if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { + error = EFAULT; + break; + } + if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { + error = EINVAL; + } + + error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); + break; + } + case VM_GET_RUN_STATE: { + struct vm_run_state vrs; + + bzero(&vrs, sizeof (vrs)); + error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, + &vrs.sipi_vector); + if (error == 0) { + if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { + error = EFAULT; + break; + } + } + break; + } + case VM_SET_RUN_STATE: { + struct vm_run_state vrs; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, + vrs.sipi_vector); + break; + } + case VM_GET_FPU: { + struct vm_fpu_state req; + const size_t max_len = (PAGESIZE * 2); + void *kbuf; + + if (ddi_copyin(datap, &req, sizeof (req), md)) { + error = EFAULT; + break; + } + if (req.len > max_len || req.len == 0) { + error = EINVAL; + break; + } + kbuf = kmem_zalloc(req.len, KM_SLEEP); + error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); + if (error == 0) { + if (ddi_copyout(kbuf, req.buf, req.len, md)) { + error = EFAULT; + } + } + kmem_free(kbuf, req.len); + break; + } + case VM_SET_FPU: { + struct vm_fpu_state req; + const size_t max_len = (PAGESIZE * 2); + void *kbuf; + + if (ddi_copyin(datap, &req, sizeof (req), md)) { + error = EFAULT; + break; + } + if (req.len > max_len || req.len == 0) { + error = EINVAL; + break; + } + kbuf = kmem_alloc(req.len, KM_SLEEP); + if (ddi_copyin(req.buf, kbuf, req.len, md)) { + error = EFAULT; + } else { + error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); + } + kmem_free(kbuf, req.len); + break; + } + + case VM_SET_KERNEMU_DEV: + case VM_GET_KERNEMU_DEV: { + struct vm_readwrite_kernemu_device kemu; + size_t size = 0; + + if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { + error = EFAULT; + break; + } + + if (kemu.access_width > 3) { + error = EINVAL; + break; + } + size = (1 << kemu.access_width); + ASSERT(size >= 1 && size <= 8); + + if (cmd == VM_SET_KERNEMU_DEV) { + error = vm_service_mmio_write(sc->vmm_vm, vcpu, + kemu.gpa, kemu.value, size); + } else { + error = vm_service_mmio_read(sc->vmm_vm, vcpu, + kemu.gpa, &kemu.value, size); + } + + if (error == 0) { + if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { + error = EFAULT; + break; + } + } + break; + } + + case VM_GET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, + &vmcap.capval); + if (error == 0 && + ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, + vmcap.capval); + break; + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + 
error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, + &x2apic.state); + if (error == 0 && + ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_GPA_PMAP: { + /* + * Until there is a necessity to leak EPT/RVI PTE values to + * userspace, this will remain unimplemented + */ + error = EINVAL; + break; + } + case VM_GET_HPET_CAPABILITIES: { + struct vm_hpet_cap hpetcap; + + error = vhpet_getcap(&hpetcap); + if (error == 0 && + ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA: { + struct vm_gla2gpa gg; + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, + gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa gg; + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, + gg.gla, gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(sc->vmm_vm, vcpu); + break; + + case VM_SUSPEND_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_RESUME_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_GET_CPUS: { + struct vm_cpuset vm_cpuset; + cpuset_t tempset; + void *srcp = &tempset; + int size; + + if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { + error = EFAULT; + break; + } + + /* Be more generous about sizing since our cpuset_t is large. */ + size = vm_cpuset.cpusetsize; + if (size <= 0 || size > sizeof (cpuset_t)) { + error = ERANGE; + } + /* + * If they want a ulong_t or less, make sure they receive the + * low bits with all the useful information. 
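		 *
		 * (Editorial example: a caller passing cpusetsize equal
		 * to sizeof (ulong_t) receives a copy of tempset.cpub[0],
		 * covering CPUs 0 through 63 on a 64-bit kernel.)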
+ */ + if (size <= sizeof (tempset.cpub[0])) { + srcp = &tempset.cpub[0]; + } + + if (vm_cpuset.which == VM_ACTIVE_CPUS) { + tempset = vm_active_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { + tempset = vm_suspended_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_DEBUG_CPUS) { + tempset = vm_debug_cpus(sc->vmm_vm); + } else { + error = EINVAL; + } + + ASSERT(size > 0 && size <= sizeof (tempset)); + if (error == 0 && + ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo vmii; + + if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { + error = EFAULT; + break; + } + error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo vmii; + + vmii.vcpuid = vcpu; + error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, + &vmii.info2); + if (error == 0 && + ddi_copyout(&vmii, datap, sizeof (vmii), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, + rtcdata.value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, + &rtcdata.value); + if (error == 0 && + ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time rtctime; + + if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + error = vrtc_set_time(sc->vmm_vm, rtctime.secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time rtctime; + + rtctime.secs = vrtc_get_time(sc->vmm_vm); + if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_PMTMR_LOCATE: { + uint16_t port = arg; + error = vpmtmr_set_location(sc->vmm_vm, port); + break; + } + + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vmm_vm, vcpu); + break; + + case VM_SET_TOPOLOGY: { + struct vm_cpu_topology topo; + + if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, + topo.threads, topo.maxcpus); + break; + } + case VM_GET_TOPOLOGY: { + struct vm_cpu_topology topo; + + vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, + &topo.threads, &topo.maxcpus); + if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + break; + } + case VM_DEVMEM_GETOFFSET: { + struct vm_devmem_offset vdo; + vmm_devmem_entry_t *de; + + if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { + error = EFAULT; + break; + } + + de = vmmdev_devmem_find(sc, vdo.segid); + if (de != NULL) { + vdo.offset = de->vde_off; + if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { + error = EFAULT; + } + } else { + error = ENOENT; + } + break; + } + case VM_TRACK_DIRTY_PAGES: { + const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; + struct vmm_dirty_tracker tracker; + uint8_t *bitmap; + size_t len; + + if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { + error = EFAULT; + break; + } + if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { + error = EINVAL; + break; + } + if (tracker.vdt_len == 0) { + break; + } + if ((tracker.vdt_len & PAGEOFFSET) != 0) { + error = 
EINVAL; + break; + } + if (tracker.vdt_len > max_track_region_len) { + error = EINVAL; + break; + } + len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; + bitmap = kmem_zalloc(len, KM_SLEEP); + vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, + tracker.vdt_len, bitmap); + if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { + error = EFAULT; + } + kmem_free(bitmap, len); + + break; + } + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any other effects. + */ + break; + } + + default: + error = ENOTTY; + break; + } + + /* Release exclusion resources */ + switch (lock_type) { + case LOCK_NONE: + break; + case LOCK_VCPU: + vcpu_unlock_one(sc, vcpu); + break; + case LOCK_READ_HOLD: + vmm_read_unlock(sc); + break; + case LOCK_WRITE_HOLD: + vmm_write_unlock(sc); + break; + default: + panic("unexpected lock type"); + break; + } + + return (error); +} + +static vmm_softc_t * +vmm_lookup(const char *name) +{ + list_t *vml = &vmm_list; + vmm_softc_t *sc; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { + if (strcmp(sc->vmm_name, name) == 0) { + break; + } + } + + return (sc); +} + +/* + * Acquire an HMA registration if not already held. + */ +static boolean_t +vmm_hma_acquire(void) +{ + ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); + + mutex_enter(&vmmdev_mtx); + + if (vmmdev_hma_reg == NULL) { + VERIFY3U(vmmdev_hma_ref, ==, 0); + vmmdev_hma_reg = hma_register(vmmdev_hvm_name); + if (vmmdev_hma_reg == NULL) { + cmn_err(CE_WARN, "%s HMA registration failed.", + vmmdev_hvm_name); + mutex_exit(&vmmdev_mtx); + return (B_FALSE); + } + } + + vmmdev_hma_ref++; + + mutex_exit(&vmmdev_mtx); + + return (B_TRUE); +} + +/* + * Release the HMA registration if held and there are no remaining VMs. + */ +static void +vmm_hma_release(void) +{ + ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); + + mutex_enter(&vmmdev_mtx); + + VERIFY3U(vmmdev_hma_ref, !=, 0); + + vmmdev_hma_ref--; + + if (vmmdev_hma_ref == 0) { + VERIFY(vmmdev_hma_reg != NULL); + hma_unregister(vmmdev_hma_reg); + vmmdev_hma_reg = NULL; + } + mutex_exit(&vmmdev_mtx); +} + +static int +vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) +{ + vmm_softc_t *sc = NULL; + minor_t minor; + int error = ENOMEM; + size_t len; + const char *name = req->name; + + len = strnlen(name, VM_MAX_NAMELEN); + if (len == 0) { + return (EINVAL); + } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); + } + if (strchr(name, '/') != NULL) { + return (EINVAL); + } + + if (!vmm_hma_acquire()) + return (ENXIO); + + mutex_enter(&vmm_mtx); + + /* Look for duplicate names */ + if (vmm_lookup(name) != NULL) { + mutex_exit(&vmm_mtx); + vmm_hma_release(); + return (EEXIST); + } + + /* Allow only one instance per non-global zone. 
*/ + if (!INGLOBALZONE(curproc)) { + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (sc->vmm_zone == curzone) { + mutex_exit(&vmm_mtx); + vmm_hma_release(); + return (EINVAL); + } + } + } + + minor = id_alloc(vmm_minors); + if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { + goto fail; + } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + ddi_soft_state_free(vmm_statep, minor); + goto fail; + } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + if (vmm_kstat_alloc(sc, minor, cr) != 0) { + goto fail; + } + + error = vm_create(req->name, req->flags, &sc->vmm_vm); + if (error == 0) { + /* Complete VM intialization and report success. */ + (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); + sc->vmm_minor = minor; + list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), + offsetof(vmm_devmem_entry_t, vde_node)); + + list_create(&sc->vmm_holds, sizeof (vmm_hold_t), + offsetof(vmm_hold_t, vmh_node)); + cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), + offsetof(vmm_lease_t, vml_node)); + cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); + rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); + + sc->vmm_zone = crgetzone(cr); + zone_hold(sc->vmm_zone); + vmm_zsd_add_vm(sc); + vmm_kstat_init(sc); + + list_insert_tail(&vmm_list, sc); + mutex_exit(&vmm_mtx); + return (0); + } + + vmm_kstat_fini(sc); + ddi_remove_minor_node(vmmdev_dip, name); +fail: + id_free(vmm_minors, minor); + if (sc != NULL) { + ddi_soft_state_free(vmm_statep, minor); + } + mutex_exit(&vmm_mtx); + vmm_hma_release(); + + return (error); +} + +/* + * Bhyve 'Driver' Interface + * + * While many devices are emulated in the bhyve userspace process, there are + * others with performance constraints which require that they run mostly or + * entirely in-kernel. For those not integrated directly into bhyve, an API is + * needed so they can query/manipulate the portions of VM state needed to + * fulfill their purpose. + * + * This includes: + * - Translating guest-physical addresses to host-virtual pointers + * - Injecting MSIs + * - Hooking IO port addresses + * + * The vmm_drv interface exists to provide that functionality to its consumers. 
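 *
 * A rough lifecycle sketch for such a consumer (editorial illustration
 * only; it simply strings together the vmm_drv_* entry points defined
 * below, with 'fp', 'cr', 'gpa', 'msi_addr', 'msg', 'my_expire_cb', and
 * 'my_arg' standing in for the consumer's own state; 'gpa' must be
 * page-aligned):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) != 0)
 *		return;
 *	if ((lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg)) !=
 *	    NULL) {
 *		vmm_page_t *pg = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *		const void *data = vmm_drv_page_readable(pg);
 *
 *		(consume guest data via 'data', then notify the guest)
 *		(void) vmm_drv_msi(lease, msi_addr, msg);
 *		vmm_drv_page_release(pg);
 *		vmm_drv_lease_break(hold, lease);
 *	}
 *	vmm_drv_rele(hold);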
+ * (At this time, 'viona' is the only user) + */ +int +vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) +{ + vnode_t *vp = fp->f_vnode; + const dev_t dev = vp->v_rdev; + vmm_softc_t *sc; + vmm_hold_t *hold; + int err = 0; + + if (vp->v_type != VCHR) { + return (ENXIO); + } + const major_t major = getmajor(dev); + const minor_t minor = getminor(dev); + + mutex_enter(&vmmdev_mtx); + if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); + } + mutex_enter(&vmm_mtx); + mutex_exit(&vmmdev_mtx); + + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + err = ENOENT; + goto out; + } + /* XXXJOY: check cred permissions against instance */ + + if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { + err = EBUSY; + goto out; + } + + hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); + hold->vmh_sc = sc; + hold->vmh_release_req = B_FALSE; + + list_insert_tail(&sc->vmm_holds, hold); + sc->vmm_flags |= VMM_HELD; + *holdp = hold; + +out: + mutex_exit(&vmm_mtx); + return (err); +} + +void +vmm_drv_rele(vmm_hold_t *hold) +{ + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(hold->vmh_sc != NULL); + VERIFY(hold->vmh_ioport_hook_cnt == 0); + + mutex_enter(&vmm_mtx); + sc = hold->vmh_sc; + list_remove(&sc->vmm_holds, hold); + if (list_is_empty(&sc->vmm_holds)) { + sc->vmm_flags &= ~VMM_HELD; + cv_broadcast(&sc->vmm_cv); + } + mutex_exit(&vmm_mtx); + kmem_free(hold, sizeof (*hold)); +} + +boolean_t +vmm_drv_release_reqd(vmm_hold_t *hold) +{ + ASSERT(hold != NULL); + + return (hold->vmh_release_req); +} + +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) +{ + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; + + ASSERT3P(expiref, !=, NULL); + + if (hold->vmh_release_req) { + return (NULL); + } + + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_break_deferred = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); + + return (lease); +} + +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); + + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + vmc_destroy(lease->vml_vmclient); + kmem_free(lease, sizeof (*lease)); +} + +static void +vmm_lease_block(vmm_softc_t *sc) +{ + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); + + while (lease != NULL) { + void *arg = lease->vml_expire_arg; + boolean_t (*expiref)(void *) = lease->vml_expire_func; + boolean_t sync_break = B_FALSE; + + /* + * Since the lease expiration notification may + * need to take locks which would deadlock with + * vmm_lease_lock, drop it across the call. + * + * We are the only one allowed to manipulate + * vmm_lease_list right now, so it is safe to + * continue iterating through it after + * reacquiring the lock. 
+ */ + lease->vml_expired = B_TRUE; + mutex_exit(&sc->vmm_lease_lock); + sync_break = expiref(arg); + mutex_enter(&sc->vmm_lease_lock); + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } + + /* Process leases which were not broken synchronously. */ + while (!list_is_empty(list)) { + /* + * Although the nested loops are quadratic, the number + * of leases is small. + */ + lease = list_head(list); + while (lease != NULL) { + vmm_lease_t *next = list_next(list, lease); + if (lease->vml_break_deferred) { + vmm_lease_break_locked(sc, lease); + } + lease = next; + } + if (list_is_empty(list)) { + break; + } + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + /* Wake anyone else waiting for the lease list to be empty */ + cv_broadcast(&sc->vmm_lease_cv); + } else { + list_t *list = &sc->vmm_lease_list; + + /* + * Some other thread beat us to the duty of lease cleanup. + * Wait until that is complete. + */ + while (!list_is_empty(list)) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + } + mutex_exit(&sc->vmm_lease_lock); +} + +static void +vmm_lease_unblock(vmm_softc_t *sc) +{ + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + VERIFY(!lease->vml_break_deferred); + + mutex_enter(&sc->vmm_lease_lock); + if (sc->vmm_lease_blocker == 0) { + vmm_lease_break_locked(sc, lease); + } else { + /* + * Defer the lease-breaking to whichever thread is currently + * cleaning up all leases as part of a vmm_lease_block() call. 
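[Editor's note] Continuing that sketch: a consumer signs a lease against its hold for stable access to guest memory, honors expiration via the callback (returning B_TRUE requests a synchronous break; returning B_FALSE would require a later vmm_drv_lease_break() call), and uses the page-hold and MSI helpers that follow. Everything other than the vmm_drv_* calls is hypothetical.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/errno.h>
	#include <sys/debug.h>
	#include <sys/mman.h>
	#include <sys/vmm_drv.h>	/* assumed header */

	typedef struct example_dev {
		vmm_hold_t	*ed_hold;
		vmm_lease_t	*ed_lease;
	} example_dev_t;

	/* Expire callback, invoked while the instance is blocking leases */
	static boolean_t
	example_lease_expired(void *arg)
	{
		example_dev_t *ed = arg;

		ed->ed_lease = NULL;
		return (B_TRUE);	/* break the lease synchronously */
	}

	static int
	example_lease_acquire(example_dev_t *ed)
	{
		ed->ed_lease = vmm_drv_lease_sign(ed->ed_hold,
		    example_lease_expired, ed);
		return (ed->ed_lease == NULL ? ENXIO : 0);
	}

	/* Copy a small, page-contained range out of guest-physical memory */
	static int
	example_copy_from_guest(example_dev_t *ed, uintptr_t gpa, void *buf,
	    size_t len)
	{
		vmm_page_t *vmp;
		const uintptr_t off = gpa & PAGEOFFSET;

		ASSERT3U(off + len, <=, PAGESIZE);

		if (ed->ed_lease == NULL || vmm_drv_lease_expired(ed->ed_lease))
			return (ENXIO);

		vmp = vmm_drv_page_hold(ed->ed_lease, gpa - off, PROT_READ);
		if (vmp == NULL)
			return (EFAULT);
		bcopy((const char *)vmm_drv_page_readable(vmp) + off, buf, len);
		vmm_drv_page_release(vmp);
		return (0);
	}

	/* Notify the guest, e.g. of ring activity, by injecting an MSI */
	static int
	example_notify_guest(example_dev_t *ed, uint64_t addr, uint64_t msg)
	{
		return (vmm_drv_msi(ed->ed_lease, addr, msg));
	}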
+ */ + lease->vml_break_deferred = B_TRUE; + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); +} + +vmm_page_t * +vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) +{ + ASSERT(lease != NULL); + ASSERT0(gpa & PAGEOFFSET); + + return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); +} + +void +vmm_drv_page_release(vmm_page_t *vmmp) +{ + vmp_release((vm_page_t *)vmmp); +} + +void +vmm_drv_page_release_chain(vmm_page_t *vmmp) +{ + vmp_release_chain((vm_page_t *)vmmp); +} + +const void * +vmm_drv_page_readable(const vmm_page_t *vmmp) +{ + return (vmp_get_readable((const vm_page_t *)vmmp)); +} + +void * +vmm_drv_page_writable(const vmm_page_t *vmmp) +{ + return (vmp_get_writable((const vm_page_t *)vmmp)); +} + +void +vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) +{ + vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); +} + +vmm_page_t * +vmm_drv_page_next(const vmm_page_t *vmmp) +{ + return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); +} + +int +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) +{ + ASSERT(lease != NULL); + + return (lapic_intr_msi(lease->vml_vm, addr, msg)); +} + +int +vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, + void *arg, void **cookie) +{ + vmm_softc_t *sc; + int err; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + + sc = hold->vmh_sc; + mutex_enter(&vmm_mtx); + /* Confirm that hook installation is not blocked */ + if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { + mutex_exit(&vmm_mtx); + return (EBUSY); + } + /* + * Optimistically record an installed hook which will prevent a block + * from being asserted while the mutex is dropped. + */ + hold->vmh_ioport_hook_cnt++; + mutex_exit(&vmm_mtx); + + vmm_write_lock(sc); + err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, + arg, cookie); + vmm_write_unlock(sc); + + if (err != 0) { + mutex_enter(&vmm_mtx); + /* Walk back optimism about the hook installation */ + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); + } + return (err); +} + +void +vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) +{ + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + ASSERT(hold->vmh_ioport_hook_cnt != 0); + + sc = hold->vmh_sc; + vmm_write_lock(sc); + vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); + + mutex_enter(&vmm_mtx); + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); +} + +static int +vmm_drv_purge(vmm_softc_t *sc) +{ + ASSERT(MUTEX_HELD(&vmm_mtx)); + + if ((sc->vmm_flags & VMM_HELD) != 0) { + vmm_hold_t *hold; + + sc->vmm_flags |= VMM_CLEANUP; + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + hold->vmh_release_req = B_TRUE; + } + while ((sc->vmm_flags & VMM_HELD) != 0) { + if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { + return (EINTR); + } + } + sc->vmm_flags &= ~VMM_CLEANUP; + } + + VERIFY(list_is_empty(&sc->vmm_holds)); + sc->vmm_flags |= VMM_PURGED; + return (0); +} + +static int +vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) +{ + int err = 0; + + mutex_enter(&vmm_mtx); + if (!enable_block) { + VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); + + sc->vmm_flags &= ~VMM_BLOCK_HOOK; + goto done; + } + + /* If any holds have hooks installed, the block is a failure */ + if (!list_is_empty(&sc->vmm_holds)) { + vmm_hold_t *hold; + + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) 
{ + if (hold->vmh_ioport_hook_cnt != 0) { + err = EBUSY; + goto done; + } + } + } + sc->vmm_flags |= VMM_BLOCK_HOOK; + +done: + mutex_exit(&vmm_mtx); + return (err); +} + +static int +vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd, + boolean_t *hma_release) +{ + dev_info_t *pdip = ddi_get_parent(vmmdev_dip); + minor_t minor; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + *hma_release = B_FALSE; + + if (vmm_drv_purge(sc) != 0) { + return (EINTR); + } + + if (clean_zsd) { + vmm_zsd_rem_vm(sc); + } + + /* Clean up devmem entries */ + vmmdev_devmem_purge(sc); + + list_remove(&vmm_list, sc); + ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); + minor = sc->vmm_minor; + zone_rele(sc->vmm_zone); + if (sc->vmm_is_open) { + list_insert_tail(&vmm_destroy_list, sc); + sc->vmm_flags |= VMM_DESTROY; + } else { + vmm_kstat_fini(sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + *hma_release = B_TRUE; + } + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + + return (0); +} + +int +vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) +{ + boolean_t hma_release = B_FALSE; + int err; + + mutex_enter(&vmm_mtx); + err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release); + mutex_exit(&vmm_mtx); + + if (hma_release) + vmm_hma_release(); + + return (err); +} + +/* ARGSUSED */ +static int +vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) +{ + boolean_t hma_release = B_FALSE; + vmm_softc_t *sc; + int err; + + if (crgetuid(cr) != 0) + return (EPERM); + + mutex_enter(&vmm_mtx); + + if ((sc = vmm_lookup(req->name)) == NULL) { + mutex_exit(&vmm_mtx); + return (ENOENT); + } + /* + * We don't check this in vmm_lookup() since that function is also used + * for validation during create and currently vmm names must be unique. + */ + if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { + mutex_exit(&vmm_mtx); + return (EPERM); + } + err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release); + + mutex_exit(&vmm_mtx); + + if (hma_release) + vmm_hma_release(); + + return (err); +} + +#define VCPU_NAME_BUFLEN 32 + +static int +vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) +{ + zoneid_t zid = crgetzoneid(cr); + int instance = minor; + kstat_t *ksp; + + ASSERT3P(sc->vmm_kstat_vm, ==, NULL); + + ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", + VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, + sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); + + if (ksp == NULL) { + return (-1); + } + sc->vmm_kstat_vm = ksp; + + for (uint_t i = 0; i < VM_MAXCPU; i++) { + char namebuf[VCPU_NAME_BUFLEN]; + + ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); + + (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); + ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, + VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, + sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), + 0, zid); + if (ksp == NULL) { + goto fail; + } + + sc->vmm_kstat_vcpu[i] = ksp; + } + + /* + * If this instance is associated with a non-global zone, make its + * kstats visible from the GZ. 
+ */ + if (zid != GLOBAL_ZONEID) { + kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); + for (uint_t i = 0; i < VM_MAXCPU; i++) { + kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); + } + } + + return (0); + +fail: + for (uint_t i = 0; i < VM_MAXCPU; i++) { + if (sc->vmm_kstat_vcpu[i] != NULL) { + kstat_delete(sc->vmm_kstat_vcpu[i]); + sc->vmm_kstat_vcpu[i] = NULL; + } else { + break; + } + } + kstat_delete(sc->vmm_kstat_vm); + sc->vmm_kstat_vm = NULL; + return (-1); +} + +static void +vmm_kstat_init(vmm_softc_t *sc) +{ + kstat_t *ksp; + + ASSERT3P(sc->vmm_vm, !=, NULL); + ASSERT3P(sc->vmm_kstat_vm, !=, NULL); + + ksp = sc->vmm_kstat_vm; + vmm_kstats_t *vk = ksp->ks_data; + ksp->ks_private = sc->vmm_vm; + kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); + kstat_named_setstr(&vk->vk_name, sc->vmm_name); + + for (uint_t i = 0; i < VM_MAXCPU; i++) { + ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); + + ksp = sc->vmm_kstat_vcpu[i]; + vmm_vcpu_kstats_t *vvk = ksp->ks_data; + + kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); + vvk->vvk_vcpu.value.ui32 = i; + kstat_named_init(&vvk->vvk_time_init, "time_init", + KSTAT_DATA_UINT64); + kstat_named_init(&vvk->vvk_time_run, "time_run", + KSTAT_DATA_UINT64); + kstat_named_init(&vvk->vvk_time_idle, "time_idle", + KSTAT_DATA_UINT64); + kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", + KSTAT_DATA_UINT64); + kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", + KSTAT_DATA_UINT64); + kstat_named_init(&vvk->vvk_time_sched, "time_sched", + KSTAT_DATA_UINT64); + ksp->ks_private = sc->vmm_vm; + ksp->ks_update = vmm_kstat_update_vcpu; + } + + kstat_install(sc->vmm_kstat_vm); + for (uint_t i = 0; i < VM_MAXCPU; i++) { + kstat_install(sc->vmm_kstat_vcpu[i]); + } +} + +static void +vmm_kstat_fini(vmm_softc_t *sc) +{ + ASSERT(sc->vmm_kstat_vm != NULL); + + kstat_delete(sc->vmm_kstat_vm); + sc->vmm_kstat_vm = NULL; + + for (uint_t i = 0; i < VM_MAXCPU; i++) { + ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); + + kstat_delete(sc->vmm_kstat_vcpu[i]); + sc->vmm_kstat_vcpu[i] = NULL; + } +} + +static int +vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + vmm_softc_t *sc; + + /* + * Forbid running bhyve in a 32-bit process until it has been tested and + * verified to be safe. + */ + if (curproc->p_model != DATAMODEL_LP64) { + return (EFBIG); + } + + minor = getminor(*devp); + if (minor == VMM_CTL_MINOR) { + /* + * Master control device must be opened exclusively. + */ + if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { + return (EINVAL); + } + + return (0); + } + + mutex_enter(&vmm_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmm_mtx); + return (ENXIO); + } + + sc->vmm_is_open = B_TRUE; + mutex_exit(&vmm_mtx); + + return (0); +} + +static int +vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + vmm_softc_t *sc; + boolean_t hma_release = B_FALSE; + + minor = getminor(dev); + if (minor == VMM_CTL_MINOR) + return (0); + + mutex_enter(&vmm_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmm_mtx); + return (ENXIO); + } + + VERIFY(sc->vmm_is_open); + sc->vmm_is_open = B_FALSE; + + /* + * If this VM was destroyed while the vmm device was open, then + * clean it up now that it is closed. 
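[Editor's note] The kstats installed above are visible from userspace through libkstat. A short reader sketch follows; the instance number and field choice are arbitrary, but the module name "vmm", the "vcpuN" kstat names, and the named fields mirror what vmm_kstat_init() creates.

	#include <kstat.h>
	#include <stdio.h>

	/* Print the accumulated "time_run" statistic for vcpu0 of one instance */
	static int
	print_vcpu0_time_run(int instance)
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if ((kc = kstat_open()) == NULL)
			return (-1);
		ksp = kstat_lookup(kc, "vmm", instance, "vcpu0");
		if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1) {
			(void) kstat_close(kc);
			return (-1);
		}
		kn = kstat_data_lookup(ksp, "time_run");
		if (kn != NULL) {
			(void) printf("vcpu0 time_run: %llu\n",
			    (unsigned long long)kn->value.ui64);
		}
		(void) kstat_close(kc);
		return (0);
	}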
+ */ + if (sc->vmm_flags & VMM_DESTROY) { + list_remove(&vmm_destroy_list, sc); + vmm_kstat_fini(sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + hma_release = B_TRUE; + } + mutex_exit(&vmm_mtx); + + if (hma_release) + vmm_hma_release(); + + return (0); +} + +static int +vmm_is_supported(intptr_t arg) +{ + int r; + const char *msg; + + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_svm()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). + */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } + + if (r != 0 && arg != (intptr_t)NULL) { + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) + return (EFAULT); + } + return (r); +} + +static int +vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) +{ + void *argp = (void *)arg; + + switch (cmd) { + case VMM_CREATE_VM: { + struct vm_create_req req; + + if ((md & FWRITE) == 0) { + return (EPERM); + } + if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { + return (EFAULT); + } + return (vmmdev_do_vm_create(&req, cr)); + } + case VMM_DESTROY_VM: { + struct vm_destroy_req req; + + if ((md & FWRITE) == 0) { + return (EPERM); + } + if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { + return (EFAULT); + } + return (vmmdev_do_vm_destroy(&req, cr)); + } + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); + case VMM_RESV_QUERY: + case VMM_RESV_ADD: + case VMM_RESV_REMOVE: + return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); + default: + break; + } + /* No other actions are legal on ctl device */ + return (ENOTTY); +} + +static int +vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + vmm_softc_t *sc; + minor_t minor; + + /* + * Forbid running bhyve in a 32-bit process until it has been tested and + * verified to be safe. 
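[Editor's note] For reference, a userspace sketch of driving the VMM_CREATE_VM ioctl handled above (normally done through libvmmapi). The /dev/vmmctl path and the <sys/vmm_dev.h> header are assumptions; the ioctl command, the exclusive-open requirement, and struct vm_create_req come from the code in this change.

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/vmm_dev.h>	/* assumed to define VMM_CREATE_VM, vm_create_req */
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Create a named VM via the control node; requires root privileges */
	static int
	example_create_vm(const char *name)
	{
		struct vm_create_req req;
		int ctlfd, err;

		/* The "ctl" minor node must be opened exclusively (O_EXCL) */
		ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
		if (ctlfd < 0)
			return (-1);

		(void) memset(&req, 0, sizeof (req));
		(void) strlcpy(req.name, name, sizeof (req.name));

		err = ioctl(ctlfd, VMM_CREATE_VM, &req);
		(void) close(ctlfd);
		return (err);
	}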
+ */ + if (curproc->p_model != DATAMODEL_LP64) { + return (EFBIG); + } + + /* The structs in bhyve ioctls assume a 64-bit datamodel */ + if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { + return (ENOTSUP); + } + + minor = getminor(dev); + + if (minor == VMM_CTL_MINOR) { + return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); + } + + sc = ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); +} + +static int +vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, + unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) +{ + vmm_softc_t *sc; + const minor_t minor = getminor(dev); + int err; + + if (minor == VMM_CTL_MINOR) { + return (ENODEV); + } + if (off < 0 || (off + len) <= 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (EACCES); + } + + sc = ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); + + if (off >= VM_DEVMEM_START) { + int segid; + off_t segoff; + + /* Mapping a devmem "device" */ + if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { + err = ENODEV; + } else { + err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, + addrp, prot, maxprot, flags); + } + } else { + /* Mapping a part of the guest physical space */ + err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, + maxprot, flags); + } + + vmm_read_unlock(sc); + return (err); +} + +static sdev_plugin_validate_t +vmm_sdev_validate(sdev_ctx_t ctx) +{ + const char *name = sdev_ctx_name(ctx); + vmm_softc_t *sc; + sdev_plugin_validate_t ret; + minor_t minor; + + if (sdev_ctx_vtype(ctx) != VCHR) + return (SDEV_VTOR_INVALID); + + VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); + + mutex_enter(&vmm_mtx); + if ((sc = vmm_lookup(name)) == NULL) + ret = SDEV_VTOR_INVALID; + else if (sc->vmm_minor != minor) + ret = SDEV_VTOR_STALE; + else + ret = SDEV_VTOR_VALID; + mutex_exit(&vmm_mtx); + + return (ret); +} + +static int +vmm_sdev_filldir(sdev_ctx_t ctx) +{ + vmm_softc_t *sc; + int ret; + + if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { + cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, + sdev_ctx_path(ctx), VMM_SDEV_ROOT); + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + ASSERT(vmmdev_dip != NULL); + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { + ret = sdev_plugin_mknod(ctx, sc->vmm_name, + S_IFCHR | 0600, + makedevice(ddi_driver_major(vmmdev_dip), + sc->vmm_minor)); + } else { + continue; + } + if (ret != 0 && ret != EEXIST) + goto out; + } + + ret = 0; + +out: + mutex_exit(&vmm_mtx); + return (ret); +} + +/* ARGSUSED */ +static void +vmm_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static sdev_plugin_ops_t vmm_sdev_ops = { + .spo_version = SDEV_PLUGIN_VERSION, + .spo_flags = SDEV_PLUGIN_SUBDIR, + .spo_validate = vmm_sdev_validate, + .spo_filldir = vmm_sdev_filldir, + .spo_inactive = vmm_sdev_inactive +}; + +/* ARGSUSED */ +static int +vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vmmdev_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return 
(error); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + sdev_plugin_hdl_t sph; + hma_reg_t *reg = NULL; + boolean_t vmm_loaded = B_FALSE; + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + mutex_enter(&vmmdev_mtx); + /* Ensure we are not already attached. */ + if (vmmdev_dip != NULL) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + + vmm_sol_glue_init(); + + /* + * Perform temporary HMA registration to determine if the system + * is capable. + */ + if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { + goto fail; + } else if (vmm_mod_load() != 0) { + goto fail; + } + vmm_loaded = B_TRUE; + hma_unregister(reg); + reg = NULL; + + /* Create control node. Other nodes will be created on demand. */ + if (ddi_create_minor_node(dip, "ctl", S_IFCHR, + VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { + goto fail; + } + + sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); + if (sph == (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(dip, NULL); + goto fail; + } + + ddi_report_dev(dip); + vmmdev_sdev_hdl = sph; + vmmdev_dip = dip; + mutex_exit(&vmmdev_mtx); + return (DDI_SUCCESS); + +fail: + if (vmm_loaded) { + VERIFY0(vmm_mod_unload()); + } + if (reg != NULL) { + hma_unregister(reg); + } + vmm_sol_glue_cleanup(); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); +} + +static int +vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* + * Ensure that all resources have been cleaned up. + * + * To prevent a deadlock with iommu_cleanup() we'll fail the detach if + * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our + * devinfo locked as iommu_cleanup() tries to recursively lock each + * devinfo, including our own, while holding vmmdev_mtx. + */ + if (mutex_tryenter(&vmmdev_mtx) == 0) + return (DDI_FAILURE); + + mutex_enter(&vmm_mtx); + if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { + mutex_exit(&vmm_mtx); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + mutex_exit(&vmm_mtx); + + if (!vmmr_is_empty()) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); + if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; + + /* Remove the control node. 
*/ + ddi_remove_minor_node(dip, "ctl"); + vmmdev_dip = NULL; + + VERIFY0(vmm_mod_unload()); + VERIFY3U(vmmdev_hma_reg, ==, NULL); + vmm_sol_glue_cleanup(); + + mutex_exit(&vmmdev_mtx); + + return (DDI_SUCCESS); +} + +static struct cb_ops vmm_cb_ops = { + vmm_open, + vmm_close, + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + vmm_ioctl, + nodev, /* devmap */ + nodev, /* mmap */ + vmm_segmap, + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_DEVMAP +}; + +static struct dev_ops vmm_ops = { + DEVO_REV, + 0, + vmm_info, + nulldev, /* identify */ + nulldev, /* probe */ + vmm_attach, + vmm_detach, + nodev, /* reset */ + &vmm_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "bhyve vmm", + &vmm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + sysinit(); + + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&vmm_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + list_create(&vmm_destroy_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); + + error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); + if (error) { + return (error); + } + + vmm_zsd_init(); + vmmr_init(); + + error = mod_install(&modlinkage); + if (error) { + ddi_soft_state_fini(&vmm_statep); + vmm_zsd_fini(); + vmmr_fini(); + } + + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) { + return (error); + } + + vmm_zsd_fini(); + vmmr_fini(); + + ddi_soft_state_fini(&vmm_statep); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_ept.c b/usr/src/uts/intel/io/vmm/vmm_sol_ept.c new file mode 100644 index 0000000000..fde4a030ce --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_sol_ept.c @@ -0,0 +1,139 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/atomic.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/mman.h> +#include <sys/x86_archext.h> +#include <vm/hat_pte.h> + +#include <sys/vmm_gpt.h> +#include <sys/vmm_vm.h> + +#define EPT_R (1 << 0) +#define EPT_W (1 << 1) +#define EPT_X (1 << 2) +#define EPT_RWX (EPT_R | EPT_W | EPT_X) +#define EPT_LGPG (1 << 7) +#define EPT_ACCESSED (1 << 8) +#define EPT_DIRTY (1 << 9) + +#define EPT_PA_MASK (0x000ffffffffff000ull) + +#define EPT_MAX_LEVELS 4 +CTASSERT(EPT_MAX_LEVELS <= MAX_GPT_LEVEL); + +CTASSERT(EPT_R == PROT_READ); +CTASSERT(EPT_W == PROT_WRITE); +CTASSERT(EPT_X == PROT_EXEC); + +static uint_t +ept_pte_prot(uint64_t pte) +{ + return (pte & EPT_RWX); +} + +static inline uint64_t +ept_attr_to_pat(uint8_t attr) +{ + uint64_t bits = attr & 0x7; + return (bits << 3); +} + +static uint64_t +ept_map_table(uint64_t pfn) +{ + const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK; + return (paddr | EPT_RWX); +} + +static uint64_t +ept_map_page(uint64_t pfn, uint_t prot, uint8_t attr) +{ + const uint64_t paddr = pfn_to_pa(pfn) & EPT_PA_MASK; + const uint64_t pat = ept_attr_to_pat(attr); + const uint64_t rprot = prot & EPT_RWX; + return (paddr | pat | rprot); +} + +static uint64_t +ept_pte_pfn(uint64_t pte) +{ + return (mmu_btop(pte & PT_PADDR)); +} + +static bool +ept_pte_is_present(uint64_t pte) +{ + return ((pte & EPT_RWX) != 0); +} + +static uint_t +ept_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits) +{ + uint64_t pte, newpte, oldpte = 0; + + /* + * We use volatile and atomic ops here because we may be + * racing against hardware modifying these bits. + */ + VERIFY3P(entry, !=, NULL); + oldpte = *entry; + do { + pte = oldpte; + newpte = (pte & ~mask) | bits; + oldpte = atomic_cas_64(entry, pte, newpte); + } while (oldpte != pte); + + return (oldpte & mask); +} + +static uint_t +ept_reset_dirty(uint64_t *entry, bool on) +{ + return (ept_reset_bits(entry, EPT_DIRTY, + on ? (EPT_DIRTY | EPT_ACCESSED) : 0)); +} + +static uint_t +ept_reset_accessed(uint64_t *entry, bool on) +{ + return (ept_reset_bits(entry, EPT_DIRTY | EPT_ACCESSED, + on ? EPT_ACCESSED : 0)); +} + +static uint64_t +ept_get_pmtp(pfn_t root_pfn) +{ + /* TODO: enable AD tracking when required */ + return ((root_pfn << PAGESHIFT | + (EPT_MAX_LEVELS - 1) << 3 | MTRR_TYPE_WB)); +} + +vmm_pte_ops_t ept_pte_ops = { + .vpeo_map_table = ept_map_table, + .vpeo_map_page = ept_map_page, + .vpeo_pte_pfn = ept_pte_pfn, + .vpeo_pte_is_present = ept_pte_is_present, + .vpeo_pte_prot = ept_pte_prot, + .vpeo_reset_dirty = ept_reset_dirty, + .vpeo_reset_accessed = ept_reset_accessed, + .vpeo_get_pmtp = ept_get_pmtp, +}; diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_glue.c b/usr/src/uts/intel/io/vmm/vmm_sol_glue.c new file mode 100644 index 0000000000..132d5dc87f --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_sol_glue.c @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2004 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/archsystm.h> +#include <sys/cpuset.h> +#include <sys/fp.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/spl.h> +#include <sys/systm.h> +#include <sys/ddidmareq.h> +#include <sys/id_space.h> +#include <sys/psm_defs.h> +#include <sys/smp_impldefs.h> +#include <sys/modhash.h> +#include <sys/hma.h> + +#include <sys/x86_archext.h> + +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmparam.h> +#include <sys/vmm_impl.h> +#include <sys/kernel.h> + +#include <vm/as.h> +#include <vm/seg_kmem.h> + +SET_DECLARE(sysinit_set, struct sysinit); + +void +sysinit(void) +{ + struct sysinit **si; + + SET_FOREACH(si, sysinit_set) + (*si)->func((*si)->data); +} + +uint8_t const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + +void +invalidate_cache_all(void) +{ + cpuset_t cpuset; + + kpreempt_disable(); + cpuset_all_but(&cpuset, CPU->cpu_id); + xc_call((xc_arg_t)NULL, (xc_arg_t)NULL, (xc_arg_t)NULL, + CPUSET2BV(cpuset), (xc_func_t)invalidate_cache); + invalidate_cache(); + kpreempt_enable(); +} + +vm_paddr_t +vtophys(void *va) +{ + pfn_t pfn; + + /* + * Since hat_getpfnum() may block on an htable mutex, this is not at + * all safe to run from a critical_enter/kpreempt_disable context. 
+ * The FreeBSD analog does not have the same locking constraints, so + * close attention must be paid wherever this is called. + */ + ASSERT(curthread->t_preempt == 0); + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); + ASSERT(pfn != PFN_INVALID); + return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); +} + +int +cpusetobj_ffs(const cpuset_t *set) +{ + uint_t large, small; + + /* + * Rather than reaching into the cpuset_t ourselves, leave that task to + * cpuset_bounds(). The simplicity is worth the extra wasted work to + * find the upper bound. + */ + cpuset_bounds(set, &small, &large); + + if (small == CPUSET_NOTINSET) { + /* The FreeBSD version returns 0 if it find nothing */ + return (0); + } + + ASSERT3U(small, <=, INT_MAX); + + /* Least significant bit index starts at 1 for valid results */ + return (small + 1); +} + +struct kmem_item { + void *addr; + size_t size; +}; +static kmutex_t kmem_items_lock; + +static mod_hash_t *vmm_alloc_hash; +uint_t vmm_alloc_hash_nchains = 16381; +uint_t vmm_alloc_hash_size = PAGESIZE; + +static void +vmm_alloc_hash_valdtor(mod_hash_val_t val) +{ + struct kmem_item *i = (struct kmem_item *)val; + + kmem_free(i->addr, i->size); + kmem_free(i, sizeof (struct kmem_item)); +} + +static void +vmm_alloc_init(void) +{ + vmm_alloc_hash = mod_hash_create_ptrhash("vmm_alloc_hash", + vmm_alloc_hash_nchains, vmm_alloc_hash_valdtor, + vmm_alloc_hash_size); + + VERIFY(vmm_alloc_hash != NULL); +} + +static uint_t +vmm_alloc_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused) +{ + struct kmem_item *i = (struct kmem_item *)val; + + cmn_err(CE_PANIC, "!vmm_alloc_check: hash not empty: %p, %lu", i->addr, + i->size); + + return (MH_WALK_TERMINATE); +} + +static void +vmm_alloc_cleanup(void) +{ + mod_hash_walk(vmm_alloc_hash, vmm_alloc_check, NULL); + mod_hash_destroy_ptrhash(vmm_alloc_hash); +} + +void * +malloc(unsigned long size, struct malloc_type *mtp, int flags) +{ + void *p; + struct kmem_item *i; + int kmem_flag = KM_SLEEP; + + if (flags & M_NOWAIT) + kmem_flag = KM_NOSLEEP; + + if (flags & M_ZERO) { + p = kmem_zalloc(size, kmem_flag); + } else { + p = kmem_alloc(size, kmem_flag); + } + + if (p == NULL) + return (NULL); + + i = kmem_zalloc(sizeof (struct kmem_item), kmem_flag); + + if (i == NULL) { + kmem_free(p, size); + return (NULL); + } + + mutex_enter(&kmem_items_lock); + i->addr = p; + i->size = size; + + VERIFY(mod_hash_insert(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0); + + mutex_exit(&kmem_items_lock); + + return (p); +} + +void +free(void *addr, struct malloc_type *mtp) +{ + mutex_enter(&kmem_items_lock); + VERIFY(mod_hash_destroy(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0); + mutex_exit(&kmem_items_lock); +} + +extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int); +extern void contig_free(void *, size_t); + +void * +contigmalloc(unsigned long size, struct malloc_type *type, int flags, + vm_paddr_t low, vm_paddr_t high, unsigned long alignment, + vm_paddr_t boundary) +{ + ddi_dma_attr_t attr = { + /* Using fastboot_dma_attr as a guide... 
*/ + DMA_ATTR_V0, + low, /* dma_attr_addr_lo */ + high, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFULL, /* dma_attr_count_max */ + alignment, /* dma_attr_align */ + 1, /* dma_attr_burstsize */ + 1, /* dma_attr_minxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_maxxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_seg: any */ + 1, /* dma_attr_sgllen */ + alignment, /* dma_attr_granular */ + 0, /* dma_attr_flags */ + }; + int cansleep = (flags & M_WAITOK); + void *result; + + ASSERT(alignment == PAGESIZE); + + result = contig_alloc((size_t)size, &attr, alignment, cansleep); + + if (result != NULL && (flags & M_ZERO) != 0) { + bzero(result, size); + } + return (result); +} + +void +contigfree(void *addr, unsigned long size, struct malloc_type *type) +{ + contig_free(addr, size); +} + +void +critical_enter(void) +{ + kpreempt_disable(); +} + +void +critical_exit(void) +{ + kpreempt_enable(); +} + + +static void +vmm_glue_callout_handler(void *arg) +{ + struct callout *c = arg; + + if (callout_active(c)) { + /* + * Record the handler fire time so that callout_pending() is + * able to detect if the callout becomes rescheduled during the + * course of the handler. + */ + c->c_fired = gethrtime(); + (c->c_func)(c->c_arg); + } +} + +void +vmm_glue_callout_init(struct callout *c, int mpsafe) +{ + cyc_handler_t hdlr; + cyc_time_t when; + + hdlr.cyh_level = CY_LOW_LEVEL; + hdlr.cyh_func = vmm_glue_callout_handler; + hdlr.cyh_arg = c; + when.cyt_when = CY_INFINITY; + when.cyt_interval = CY_INFINITY; + bzero(c, sizeof (*c)); + + mutex_enter(&cpu_lock); + c->c_cyc_id = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); +} + +void +callout_reset_hrtime(struct callout *c, hrtime_t target, void (*func)(void *), + void *arg, int flags) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + if ((flags & C_ABSOLUTE) == 0) { + target += gethrtime(); + } + + c->c_func = func; + c->c_arg = arg; + c->c_target = target; + cyclic_reprogram(c->c_cyc_id, target); +} + +int +vmm_glue_callout_stop(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; + cyclic_reprogram(c->c_cyc_id, CY_INFINITY); + + return (0); +} + +int +vmm_glue_callout_drain(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; + mutex_enter(&cpu_lock); + cyclic_remove(c->c_cyc_id); + c->c_cyc_id = CYCLIC_NONE; + mutex_exit(&cpu_lock); + + return (0); +} + +void +vmm_glue_callout_localize(struct callout *c) +{ + mutex_enter(&cpu_lock); + cyclic_move_here(c->c_cyc_id); + mutex_exit(&cpu_lock); +} + +/* + * Given an interval (in ns) and a frequency (in hz), calculate the number of + * "ticks" at that frequency which cover the interval. + */ +uint64_t +hrt_freq_count(hrtime_t interval, uint32_t freq) +{ + ASSERT3S(interval, >=, 0); + const uint64_t sec = interval / NANOSEC; + const uint64_t nsec = interval % NANOSEC; + + return ((sec * freq) + ((nsec * freq) / NANOSEC)); +} + +/* + * Given a frequency (in hz) and number of "ticks", calculate the interval + * (in ns) which would be covered by those ticks. 
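[Editor's note] A worked example of the tick conversions above, tied to the callout glue: at a hypothetical 1000 Hz timer, hrt_freq_count(2500000, 1000) is 2 (2.5 ms truncates to 2 whole ticks) and hrt_freq_interval(1000, 2) is 2000000 ns. The sketch below arms a previously-initialized callout that far in the future; the prototypes are assumed to come from the vmm glue/compat headers.

	#include <sys/time.h>

	/* 'c' must already have been set up with vmm_glue_callout_init() */
	static void
	example_arm_ticks(struct callout *c, void (*handler)(void *), void *arg,
	    uint64_t ticks, uint32_t freq)
	{
		const hrtime_t delta = hrt_freq_interval(freq, ticks);

		/* flags == 0: relative deadline, gethrtime() is added internally */
		callout_reset_hrtime(c, delta, handler, arg, 0);
	}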
+ */ +hrtime_t +hrt_freq_interval(uint32_t freq, uint64_t count) +{ + const uint64_t sec = count / freq; + const uint64_t frac = count % freq; + + return ((NANOSEC * sec) + ((frac * NANOSEC) / freq)); +} + + +uint_t cpu_high; /* Highest arg to CPUID */ +uint_t cpu_exthigh; /* Highest arg to extended CPUID */ +uint_t cpu_id; /* Stepping ID */ +char cpu_vendor[20]; /* CPU Origin code */ + +static void +vmm_cpuid_init(void) +{ + uint_t regs[4]; + + do_cpuid(0, regs); + cpu_high = regs[0]; + ((uint_t *)&cpu_vendor)[0] = regs[1]; + ((uint_t *)&cpu_vendor)[1] = regs[3]; + ((uint_t *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + do_cpuid(1, regs); + cpu_id = regs[0]; + + do_cpuid(0x80000000, regs); + cpu_exthigh = regs[0]; +} + +void +vmm_sol_glue_init(void) +{ + vmm_alloc_init(); + vmm_cpuid_init(); +} + +void +vmm_sol_glue_cleanup(void) +{ + vmm_alloc_cleanup(); +} + + +/* From FreeBSD's sys/kern/subr_clock.c */ + +/*- + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + */ + +#include <sys/clock.h> + +/* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. 
Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + int i, year, days; + + year = ct->year; + +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ct_to_ts("); + print_ct(ct); + printf(")"); + } +#endif + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (sizeof (time_t) == 4 && year > 2037)) { /* time_t overflow */ +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = EINVAL\n"); +#endif + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec); +#endif + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ts_to_ct(%ld.%09ld) = ", + (long)ts->tv_sec, (long)ts->tv_nsec); + print_ct(ct); + printf("\n"); + } +#endif +} + +/* Equivalent to the FreeBSD rdtsc(), but with any necessary per-cpu offset */ +uint64_t +rdtsc_offset(void) +{ + /* + * The timestamp logic will decide if a delta need be applied to the + * unscaled hrtime reading (effectively rdtsc), but we do require it be + * backed by the TSC itself. 
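[Editor's note] A worked example of the calendar conversion: 2000-03-01T00:00:00 UTC is 30 whole years past 1970 (7 of them leap, so 10957 days) plus 31 + 29 days for January and February of the leap year 2000, i.e. 11017 days, and 11017 * 86400 = 951868800 seconds. A sketch, assuming the same compat <sys/clock.h> environment this file builds in:

	#include <sys/clock.h>
	#include <sys/debug.h>

	static void
	example_ct_to_ts(void)
	{
		struct clocktime ct = {
			.year = 2000, .mon = 3, .day = 1,
			.hour = 0, .min = 0, .sec = 0, .nsec = 0,
		};
		struct timespec ts;

		VERIFY0(clock_ct_to_ts(&ct, &ts));
		VERIFY3S(ts.tv_sec, ==, 951868800);
		VERIFY3S(ts.tv_nsec, ==, 0);
	}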
+ */ + extern hrtime_t (*gethrtimeunscaledf)(void); + extern hrtime_t tsc_gethrtimeunscaled(void); + extern hrtime_t tsc_gethrtimeunscaled_delta(void); + + ASSERT(*gethrtimeunscaledf == tsc_gethrtimeunscaled || + *gethrtimeunscaledf == tsc_gethrtimeunscaled_delta); + return ((uint64_t)gethrtimeunscaledf()); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_rvi.c b/usr/src/uts/intel/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..8b45782d25 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,157 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/atomic.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/mach_mmu.h> +#include <sys/mman.h> +#include <sys/x86_archext.h> +#include <vm/hat_pte.h> + +#include <sys/vmm_gpt.h> +#include <sys/vmm_vm.h> + +static inline uint64_t +rvi_prot(uint_t prot) +{ + uint64_t bits; + + bits = 0; + if ((prot & PROT_WRITE) != 0) + bits |= PT_WRITABLE; + if ((prot & PROT_EXEC) == 0) + bits |= PT_NX; + + return (bits); +} + +static uint_t +rvi_pte_prot(uint64_t pte) +{ + uint_t prot; + + if ((pte & PT_VALID) == 0) + return (0); + + prot = PROT_READ; + if ((pte & PT_NX) == 0) + prot |= PROT_EXEC; + if ((pte & PT_WRITABLE) != 0) + prot |= PROT_WRITE; + + return (prot); +} + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(uint8_t attr) +{ + + if (attr == MTRR_TYPE_UC) + return (PT_NOCACHE | PT_WRITETHRU); + if (attr == MTRR_TYPE_WB) + return (0); + + panic("unexpected memattr %x", attr); +} + +static uint64_t +rvi_map_table(uint64_t pfn) +{ + const uint64_t paddr = pfn_to_pa(pfn); + const uint64_t flags = PT_USER | PT_REF | PT_VALID; + const uint64_t pat = rvi_attr_to_pat(MTRR_TYPE_WB); + const uint64_t rprot = PT_WRITABLE; + return (paddr | flags | pat | rprot); +} + +static uint64_t +rvi_map_page(uint64_t pfn, uint_t prot, uint8_t attr) +{ + const uint64_t paddr = pfn_to_pa(pfn); + const uint64_t flags = PT_USER | PT_REF | PT_VALID; + const uint64_t pat = rvi_attr_to_pat(attr); + const uint64_t rprot = rvi_prot(prot); + return (paddr | flags | pat | rprot); +} + +static pfn_t +rvi_pte_pfn(uint64_t pte) +{ + return (mmu_btop(pte & PT_PADDR)); +} + +static bool +rvi_pte_is_present(uint64_t pte) +{ + return ((pte & PT_VALID) == PT_VALID); +} + +static uint_t +rvi_reset_bits(volatile uint64_t *entry, uint64_t mask, uint64_t bits) +{ + uint64_t pte, newpte, oldpte = 0; + + /* + * We use volatile and atomic ops here because we may be + * racing against hardware modifying these bits. 
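[Editor's note] To make the page-table entry composition concrete, this is how the RVI back-end above and the EPT back-end earlier in this change encode a writable, non-executable, write-back guest mapping (bit names as defined in the respective files; MTRR_TYPE_WB is 6, so the EPT memory-type field is 0x30):

	/*
	 * RVI: rvi_map_page(pfn, PROT_READ | PROT_WRITE, MTRR_TYPE_WB)
	 *	pfn_to_pa(pfn)
	 *	    | PT_USER | PT_REF | PT_VALID	(fixed mapping flags)
	 *	    | 0					(WB needs no PAT bits)
	 *	    | PT_WRITABLE | PT_NX		(writable; PROT_EXEC absent)
	 *
	 * EPT: ept_map_page(pfn, PROT_READ | PROT_WRITE, MTRR_TYPE_WB)
	 *	(pfn_to_pa(pfn) & EPT_PA_MASK)
	 *	    | (MTRR_TYPE_WB << 3)		(memory type, 0x30)
	 *	    | EPT_R | EPT_W			(0x3; EPT_X left clear)
	 */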
+ */ + VERIFY3P(entry, !=, NULL); + oldpte = *entry; + do { + pte = oldpte; + newpte = (pte & ~mask) | bits; + oldpte = atomic_cas_64(entry, pte, newpte); + } while (oldpte != pte); + + return (oldpte & mask); +} + +static uint_t +rvi_reset_dirty(uint64_t *entry, bool on) +{ + return (rvi_reset_bits(entry, PT_MOD, on ? (PT_MOD | PT_REF) : 0)); +} + +static uint_t +rvi_reset_accessed(uint64_t *entry, bool on) +{ + return (rvi_reset_bits(entry, (PT_MOD | PT_REF), on ? PT_REF : 0)); +} + +static uint64_t +rvi_get_pmtp(pfn_t root_pfn) +{ + return (root_pfn << PAGESHIFT); +} + +vmm_pte_ops_t rvi_pte_ops = { + .vpeo_map_table = rvi_map_table, + .vpeo_map_page = rvi_map_page, + .vpeo_pte_pfn = rvi_pte_pfn, + .vpeo_pte_is_present = rvi_pte_is_present, + .vpeo_pte_prot = rvi_pte_prot, + .vpeo_reset_dirty = rvi_reset_dirty, + .vpeo_reset_accessed = rvi_reset_accessed, + .vpeo_get_pmtp = rvi_get_pmtp, +}; diff --git a/usr/src/uts/intel/io/vmm/vmm_stat.c b/usr/src/uts/intel/io/vmm/vmm_stat.c new file mode 100644 index 0000000000..da38bb7de5 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_stat.c @@ -0,0 +1,171 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). 
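[Editor's note] A sketch of how a statistic is defined and bumped with this machinery (the stat names are hypothetical; the macros and the VMM_KEEP_STATS gate come from vmm_stat.h later in this change):

	#include "vmm_stat.h"

	/* One scalar counter and a small per-queue array, registered via SYSINIT */
	VMM_STAT(EXAMPLE_EXITS, "example: total exits observed");
	VMM_STAT_ARRAY(EXAMPLE_QUEUE_KICKS, 4, "example: kicks per queue");

	static void
	example_account(struct vm *vm, int vcpu, int queue)
	{
		/* Both helpers compile to no-ops unless VMM_KEEP_STATS is defined */
		vmm_stat_incr(vm, vcpu, EXAMPLE_EXITS, 1);
		vmm_stat_array_incr(vm, vcpu, EXAMPLE_QUEUE_KICKS, queue, 1);
	}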
+ */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof (uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_svm()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_MMIO_EMUL, "vm exits for mmio emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); +VMM_STAT(VMEXIT_RUN_STATE, "number of vm exits due to run_state change"); diff --git a/usr/src/uts/intel/io/vmm/vmm_stat.h b/usr/src/uts/intel/io/vmm/vmm_stat.h new file mode 100644 index 0000000000..2975a4a914 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_stat.h @@ -0,0 
+1,171 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +#include <machine/vmm.h> + +struct vm; + +#ifdef __FreeBSD__ +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#else +#define MAX_VMM_STAT_ELEMS (64 + VM_MAXCPU) /* arbitrary */ +#endif + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int 
*num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static __inline void +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static __inline void +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static __inline void +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +static __inline void +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_MMIO_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); +VMM_STAT_DECLARE(VMEXIT_RUN_STATE); +#endif diff --git a/usr/src/uts/intel/io/vmm/vmm_support.s b/usr/src/uts/intel/io/vmm/vmm_support.s new file mode 100644 index 0000000000..4bc973468a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_support.s @@ -0,0 +1,55 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... 
*/ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/intel/io/vmm/vmm_util.c b/usr/src/uts/intel/io/vmm/vmm_util.c new file mode 100644 index 0000000000..05dfd08aaa --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_util.c @@ -0,0 +1,83 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +bool +vmm_is_intel(void) +{ + + return (strcmp(cpu_vendor, "GenuineIntel") == 0); +} + +bool +vmm_is_svm(void) +{ + return (strcmp(cpu_vendor, "AuthenticAMD") == 0 || + strcmp(cpu_vendor, "HygonGenuine") == 0); +} + +bool +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (true); + } + return (false); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_util.h b/usr/src/uts/intel/io/vmm/vmm_util.h new file mode 100644 index 0000000000..ff93ce5733 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_util.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +bool vmm_is_intel(void); +bool vmm_is_svm(void); +bool vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/usr/src/uts/intel/io/vmm/vmm_vm.c b/usr/src/uts/intel/io/vmm/vmm_vm.c new file mode 100644 index 0000000000..c87c8a62d3 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_vm.c @@ -0,0 +1,1467 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/list.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/vmsystm.h> +#include <sys/malloc.h> +#include <sys/x86_archext.h> +#include <vm/as.h> +#include <vm/hat_i86.h> +#include <vm/seg_vn.h> +#include <vm/seg_kmem.h> + +#include <sys/vmm_vm.h> +#include <sys/seg_vmm.h> +#include <sys/vmm_kernel.h> +#include <sys/vmm_reservoir.h> +#include <sys/vmm_gpt.h> + + +/* + * VMM Virtual Memory + * + * History + * + * When bhyve was ported to illumos, one significant hole was handling guest + * memory and memory accesses. In the original Pluribus port, bhyve itself + * manually handled the EPT structures for guest memory. The updated sources + * (from FreeBSD 11) took a different approach, using the native FreeBSD VM + * system for memory allocations and management of the EPT structures. Keeping + * source differences to a minimum was a priority, so illumos-bhyve implemented + * a makeshift "VM shim" which exposed the bare minimum of those interfaces to + * boot and run guests. 
+ * + * While the VM shim was successful in getting illumos-bhyve to a functional + * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the + * compatibility interfaces made it awkward to use. As source differences with + * the upstream kernel code became less of a concern, and upcoming features + * (such as live migration) would demand more of those VM interfaces, it became + * clear that an overhaul was prudent. + * + * Design + * + * The new VM system for bhyve retains a number of the same concepts as what it + * replaces: + * + * - `vmspace_t` is the top-level entity for a guest memory space + * - `vm_object_t` represents a memory object which can be mapped into a vmspace + * - `vm_page_t` represents a page hold within a given vmspace, providing access + * to the underlying memory page + * + * Unlike the old code, where most of the involved structures were exposed via + * public definitions, this replacement VM interface keeps all involved + * structures opaque to consumers. Furthermore, there is a clear delineation + * between infrequent administrative operations (such as mapping/unmapping + * regions) and common data-path operations (attempting a page hold at a given + * guest-physical address). Those administrative operations are performed + * directly against the vmspace, whereas the data-path operations are performed + * through a `vm_client_t` handle. That VM client abstraction is meant to + * reduce contention and overhead for frequent access operations and provide + * debugging insight into how different subcomponents are accessing the vmspace. + * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv + * interface) and each VMM userspace segment mapping. + * + * Exclusion + * + * Making changes to the vmspace (such as mapping or unmapping regions) requires + * other accessors be excluded while the change is underway to prevent them from + * observing invalid intermediate states. A simple approach could use a mutex + * or rwlock to achieve this, but that risks contention when the rate of access + * to the vmspace is high. + * + * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion + * at a per-vm_client_t basis. While this raises the cost for vmspace changes, + * it means that the much more common page accesses through the vm_client can + * normally proceed unimpeded and independently. + * + * When a change to the vmspace is required, the caller will put the vmspace in + * a 'hold' state, iterating over all associated vm_client instances, waiting + * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before + * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on + * the vm_client which would access the vmspace state (vmc_hold or vmc_fault) + * will block until the hold condition is cleared. Once the hold is asserted + * for all clients, the vmspace change can proceed with confidence. Upon + * completion of that operation, VCS_HOLD is cleared from the clients, and they + * are released to resume vmspace accesses. + * + * vCPU Consumers + * + * Access to the vmspace for vCPUs running in guest context is different from + * emulation-related vm_client activity: they solely rely on the contents of the + * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude + * client access is not feasible when entering guest context, since interrupts + * are disabled, making it impossible to block entry. 
This is not a concern as + * long as vmspace modifications never place the page tables in invalid states + * (either intermediate, or final). The vm_client hold mechanism does provide + * the means to IPI vCPU consumers which will trigger a notification once they + * report their exit from guest context. This can be used to ensure that page + * table modifications are made visible to those vCPUs within a certain + * time frame. + */ + +typedef struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t *vmsm_object; /* object backing this mapping */ + uintptr_t vmsm_addr; /* start addr in vmspace for mapping */ + size_t vmsm_len; /* length (in bytes) of mapping */ + off_t vmsm_offset; /* byte offset into object */ + uint_t vmsm_prot; +} vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + +typedef enum vm_client_state { + VCS_IDLE = 0, + /* currently accessing vmspace for client operation (hold or fault) */ + VCS_ACTIVE = (1 << 0), + /* client hold requested/asserted */ + VCS_HOLD = (1 << 1), + /* vCPU is accessing page tables in guest context */ + VCS_ON_CPU = (1 << 2), + /* client has been orphaned (no more access to vmspace) */ + VCS_ORPHANED = (1 << 3), + /* client undergoing destroy operation */ + VCS_DESTROY = (1 << 4), +} vm_client_state_t; + +struct vmspace { + kmutex_t vms_lock; + kcondvar_t vms_cv; + bool vms_held; + uintptr_t vms_size; /* immutable after creation */ + + /* (nested) page table state */ + vmm_gpt_t *vms_gpt; + uint64_t vms_pt_gen; + uint64_t vms_pages_mapped; + bool vms_track_dirty; + + list_t vms_maplist; + list_t vms_clients; +}; + +struct vm_client { + vmspace_t *vmc_space; + list_node_t vmc_node; + + kmutex_t vmc_lock; + kcondvar_t vmc_cv; + vm_client_state_t vmc_state; + int vmc_cpu_active; + uint64_t vmc_cpu_gen; + bool vmc_track_dirty; + vmc_inval_cb_t vmc_inval_func; + void *vmc_inval_data; + + list_t vmc_held_pages; +}; + +typedef enum vm_object_type { + VMOT_NONE, + VMOT_MEM, + VMOT_MMIO, +} vm_object_type_t; + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* Fields below are fixed at creation time */ + vm_object_type_t vmo_type; + size_t vmo_size; + void *vmo_data; + uint8_t vmo_attr; +}; + +struct vm_page { + vm_client_t *vmp_client; + list_node_t vmp_node; + vm_page_t *vmp_chain; + uintptr_t vmp_gpa; + pfn_t vmp_pfn; + uint64_t *vmp_ptep; + vm_object_t *vmp_obj_ref; + int vmp_prot; +}; + +static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t); +static void vmspace_hold_enter(vmspace_t *); +static void vmspace_hold_exit(vmspace_t *, bool); +static void vmc_space_hold(vm_client_t *); +static void vmc_space_release(vm_client_t *, bool); +static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t); +static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *); +static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *); + + +/* + * Create a new vmspace with a maximum address of `end`. + */ +vmspace_t * +vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty) +{ + vmspace_t *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. 
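+ *
+ * A hedged usage sketch (the pte_ops argument is assumed to be supplied
+ * by the platform-specific VMX/SVM code, which is not part of this file):
+ * a guest with a 4 GiB physical address space would be created with
+ *
+ *	vmspace_t *vms = vmspace_alloc((4UL << 30) - 1, pte_ops, false);
+ *
+ * That is, `end` is the highest valid guest-physical address, so the
+ * derived `size` (end + 1) below must itself be page-aligned.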
+ */ + VERIFY(size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + list_create(&vms->vms_clients, sizeof (vm_client_t), + offsetof(vm_client_t, vmc_node)); + + vms->vms_gpt = vmm_gpt_alloc(pte_ops); + vms->vms_pt_gen = 1; + vms->vms_track_dirty = track_dirty; + + return (vms); +} + +/* + * Destroy a vmspace. All regions in the space must be unmapped. Any remaining + * clients will be orphaned. + */ +void +vmspace_destroy(vmspace_t *vms) +{ + mutex_enter(&vms->vms_lock); + VERIFY(list_is_empty(&vms->vms_maplist)); + + if (!list_is_empty(&vms->vms_clients)) { + vm_client_t *vmc = list_head(&vms->vms_clients); + while (vmc != NULL) { + vmc = vmc_space_orphan(vmc, vms); + } + /* + * Wait for any clients which were in the process of destroying + * themselves to disappear. + */ + while (!list_is_empty(&vms->vms_clients)) { + cv_wait(&vms->vms_cv, &vms->vms_lock); + } + } + VERIFY(list_is_empty(&vms->vms_clients)); + + vmm_gpt_free(vms->vms_gpt); + mutex_exit(&vms->vms_lock); + + mutex_destroy(&vms->vms_lock); + cv_destroy(&vms->vms_cv); + list_destroy(&vms->vms_maplist); + list_destroy(&vms->vms_clients); + + kmem_free(vms, sizeof (*vms)); +} + +/* + * Retrieve the count of resident (mapped into the page tables) pages. + */ +uint64_t +vmspace_resident_count(vmspace_t *vms) +{ + return (vms->vms_pages_mapped); +} + +void +vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap) +{ + /* + * Accumulate dirty bits into the given bit vector. Note that this + * races both against hardware writes from running vCPUs and + * reflections from userspace. + * + * Called from a userspace-visible ioctl, this depends on the VM + * instance being read-locked to prevent vmspace_map/vmspace_unmap + * operations from changing the page tables during the walk. + */ + for (size_t offset = 0; offset < len; offset += PAGESIZE) { + bool bit = false; + uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset); + if (entry != NULL) + bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false); + uint64_t pfn_offset = offset >> PAGESHIFT; + size_t bit_offset = pfn_offset / 8; + size_t bit_index = pfn_offset % 8; + bitmap[bit_offset] |= (bit << bit_index); + } + + /* + * Now invalidate those bits and shoot down address spaces that + * may have them cached. + */ + vmspace_hold_enter(vms); + vms->vms_pt_gen++; + for (vm_client_t *vmc = list_head(&vms->vms_clients); + vmc != NULL; + vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen); + } + vmspace_hold_exit(vms, true); +} + +static pfn_t +vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off) +{ + vmmr_region_t *region; + pfn_t pfn; + + ASSERT3U(vmo->vmo_type, ==, VMOT_MEM); + + region = vmo->vmo_data; + pfn = vmmr_region_pfn_at(region, off); + + return (pfn); +} + +static pfn_t +vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off) +{ + pfn_t pfn; + + ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO); + ASSERT3P(vmo->vmo_data, !=, NULL); + ASSERT3U(off, <, vmo->vmo_size); + + pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT; + + return (pfn); +} + +/* + * Allocate a VM object backed by VMM reservoir memory. 
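+ *
+ * A minimal sketch of how such an object might be created and mapped
+ * into a vmspace (`vms`, `gpa` and `len` are assumed to be supplied by
+ * the caller and page-aligned; error handling is abbreviated):
+ *
+ *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
+ *	if (vmo == NULL)
+ *		return (ENOMEM);
+ *	if (vmspace_map(vms, vmo, 0, gpa, len,
+ *	    PROT_READ | PROT_WRITE | PROT_EXEC) != 0) {
+ *		vm_object_release(vmo);
+ *		return (ENOMEM);
+ *	}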
+ */ +vm_object_t * +vm_object_mem_allocate(size_t size, bool transient) +{ + int err; + vmmr_region_t *region = NULL; + vm_object_t *vmo; + + ASSERT3U(size, !=, 0); + ASSERT3U(size & PAGEOFFSET, ==, 0); + + err = vmmr_alloc(size, transient, ®ion); + if (err != 0) { + return (NULL); + } + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = VMOT_MEM; + vmo->vmo_size = size; + vmo->vmo_attr = MTRR_TYPE_WB; + vmo->vmo_data = region; + vmo->vmo_refcnt = 1; + + return (vmo); +} + +static vm_object_t * +vm_object_mmio_allocate(size_t size, uintptr_t hpa) +{ + vm_object_t *vmo; + + ASSERT3U(size, !=, 0); + ASSERT3U(size & PAGEOFFSET, ==, 0); + ASSERT3U(hpa & PAGEOFFSET, ==, 0); + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = VMOT_MMIO; + vmo->vmo_size = size; + vmo->vmo_attr = MTRR_TYPE_UC; + vmo->vmo_data = (void *)hpa; + vmo->vmo_refcnt = 1; + + return (vmo); +} + +/* + * Allocate a VM object backed by an existing range of physical memory. + */ +vm_object_t * +vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa) +{ + int error; + vm_object_t *obj; + + obj = vm_object_mmio_allocate(len, hpa); + if (obj != NULL) { + error = vmspace_map(vmspace, obj, 0, gpa, len, + PROT_READ | PROT_WRITE); + if (error != 0) { + vm_object_release(obj); + obj = NULL; + } + } + + return (obj); +} + +/* + * Release a vm_object reference + */ +void +vm_object_release(vm_object_t *vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); + /* underflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, UINT_MAX); + if (ref != 0) { + return; + } + + switch (vmo->vmo_type) { + case VMOT_MEM: + vmmr_free((vmmr_region_t *)vmo->vmo_data); + break; + case VMOT_MMIO: + break; + default: + panic("unexpected object type %u", vmo->vmo_type); + break; + } + + vmo->vmo_data = NULL; + vmo->vmo_size = 0; + kmem_free(vmo, sizeof (*vmo)); +} + +/* + * Increase refcount for vm_object reference + */ +void +vm_object_reference(vm_object_t *vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); + /* overflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, 0); +} + +/* + * Get the host-physical PFN for a given offset into a vm_object. + * + * The provided `off` must be within the allocated size of the vm_object. + */ +pfn_t +vm_object_pfn(vm_object_t *vmo, uintptr_t off) +{ + const uintptr_t aligned_off = off & PAGEMASK; + + switch (vmo->vmo_type) { + case VMOT_MEM: + return (vm_object_pager_reservoir(vmo, aligned_off)); + case VMOT_MMIO: + return (vm_object_pager_mmio(vmo, aligned_off)); + case VMOT_NONE: + break; + } + panic("unexpected object type %u", vmo->vmo_type); +} + +static vmspace_mapping_t * +vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT3U(addr, <=, range_end); + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +/* + * Check to see if any mappings reside within [addr, addr + size) span in the + * vmspace, returning true if that span is indeed empty. 
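+ *
+ * The loop below uses the usual interval argument: two inclusive ranges
+ * [a, a_end] and [b, b_end] are disjoint exactly when a > b_end or
+ * b > a_end. As a worked example, an existing mapping at
+ * [0x1000, 0x2fff] and a request for [0x3000, 0x3fff] are disjoint
+ * (0x3000 > 0x2fff), while a request for [0x2000, 0x4fff] overlaps it,
+ * so the function would return false.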
+ */ +static bool +vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size - 1; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(size > 0); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1; + + /* + * The two ranges do not overlap if the start of either of + * them is after the end of the other. + */ + if (vmsm->vmsm_addr > range_end || addr > seg_end) + continue; + return (false); + } + return (true); +} + +static void +vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_held); + + list_remove(ml, vmsm); + vm_object_release(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +/* + * Enter a hold state on the vmspace. This ensures that all VM clients + * associated with the vmspace are excluded from establishing new page holds, + * or any other actions which would require accessing vmspace state subject to + * potential change. + * + * Returns with vmspace_t`vms_lock held. + */ +static void +vmspace_hold_enter(vmspace_t *vms) +{ + mutex_enter(&vms->vms_lock); + VERIFY(!vms->vms_held); + + vm_client_t *vmc = list_head(&vms->vms_clients); + for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_hold(vmc); + } + vms->vms_held = true; +} + +/* + * Exit a hold state on the vmspace. This releases all VM clients associated + * with the vmspace to be able to establish new page holds, and partake in other + * actions which require accessing changed vmspace state. If `kick_on_cpu` is + * true, then any CPUs actively using the page tables will be IPIed, and the + * call will block until they have acknowledged being ready to use the latest + * state of the tables. + * + * Requires vmspace_t`vms_lock be held, which is released as part of the call. + */ +static void +vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu) +{ + ASSERT(MUTEX_HELD(&vms->vms_lock)); + VERIFY(vms->vms_held); + + vm_client_t *vmc = list_head(&vms->vms_clients); + for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_release(vmc, kick_on_cpu); + } + vms->vms_held = false; + mutex_exit(&vms->vms_lock); +} + +/* + * Attempt to map a vm_object span into the vmspace. 
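+ *
+ * A hedged sketch of the expected call pattern (the requirements are
+ * listed below; `vms`, `vmo`, `gpa` and `len` are assumed to satisfy
+ * them). Note that the eventual unmap must name exactly the same
+ * [gpa, gpa + len) span:
+ *
+ *	if (vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE) == 0) {
+ *		...
+ *		VERIFY0(vmspace_unmap(vms, gpa, gpa + len));
+ *	}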
+ * + * Requirements: + * - `obj_off`, `addr`, and `len` must be page-aligned + * - `obj_off` cannot be greater than the allocated size of the object + * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated + * size of the object + * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address + * of the vmspace + */ +int +vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr, + size_t len, uint8_t prot) +{ + vmspace_mapping_t *vmsm; + int res = 0; + + if (len == 0 || (addr + len) < addr || + obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) { + return (EINVAL); + } + if ((addr + len) >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + vmspace_hold_enter(vms); + if (!vm_mapping_gap(vms, addr, len)) { + kmem_free(vmsm, sizeof (*vmsm)); + res = ENOMEM; + } else { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = addr; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)obj_off; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* + * Make sure the GPT has tables ready for leaf entries across + * the entire new mapping. + */ + vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len); + } + vmspace_hold_exit(vms, false); + return (res); +} + +/* + * Unmap a region of the vmspace. + * + * Presently the [start, end) span must equal a region previously mapped by a + * call to vmspace_map(). + */ +int +vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end) +{ + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + vm_client_t *vmc; + uint64_t gen = 0; + + ASSERT(start < end); + + vmspace_hold_enter(vms); + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, start, size)) == NULL || + vmsm->vmsm_addr != start || vmsm->vmsm_len != size) { + vmspace_hold_exit(vms, false); + return (ENOENT); + } + + /* Prepare clients (and their held pages) for the unmap. */ + for (vmc = list_head(&vms->vms_clients); vmc != NULL; + vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_unmap(vmc, start, size, vmsm->vmsm_object); + } + + /* Clear all PTEs for region */ + if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) { + vms->vms_pt_gen++; + gen = vms->vms_pt_gen; + } + /* ... and the intermediate (directory) PTEs as well */ + vmm_gpt_vacate_region(vms->vms_gpt, start, end); + + /* + * If pages were actually unmapped from the GPT, provide clients with + * an invalidation notice. + */ + if (gen != 0) { + for (vmc = list_head(&vms->vms_clients); vmc != NULL; + vmc = list_next(&vms->vms_clients, vmc)) { + vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen); + } + } + + vm_mapping_remove(vms, vmsm); + vmspace_hold_exit(vms, true); + return (0); +} + +static int +vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp, + uint64_t **ptepp) +{ + vmm_gpt_t *gpt = vms->vms_gpt; + uint64_t *entries[MAX_GPT_LEVEL], *leaf; + pfn_t pfn = PFN_INVALID; + uint_t prot; + + ASSERT0(gpa & PAGEOFFSET); + ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE); + + vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); + leaf = entries[LEVEL1]; + if (leaf == NULL) { + /* + * Since we populated the intermediate tables for any regions + * mapped in the GPT, an empty leaf entry indicates there is no + * mapping, populated or not, at this GPT. 
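+ *
+ * Callers surface this in their own terms: vmc_hold() below returns NULL
+ * for a failed hold, while vmc_fault() returns the FC_NOMAP code
+ * directly to its caller.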
+ */ + return (FC_NOMAP); + } + + if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) { + if ((req_prot & prot) != req_prot) { + return (FC_PROT); + } + } else { + vmspace_mapping_t *vmsm; + vm_object_t *vmo; + + vmsm = vm_mapping_find(vms, gpa, PAGESIZE); + if (vmsm == NULL) { + return (FC_NOMAP); + } + + if ((req_prot & vmsm->vmsm_prot) != req_prot) { + return (FC_PROT); + } + vmo = vmsm->vmsm_object; + pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); + VERIFY(pfn != PFN_INVALID); + + if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot, + vmo->vmo_attr)) { + atomic_inc_64(&vms->vms_pages_mapped); + } + } + + ASSERT(pfn != PFN_INVALID && leaf != NULL); + if (pfnp != NULL) { + *pfnp = pfn; + } + if (ptepp != NULL) { + *ptepp = leaf; + } + return (0); +} + +/* + * Populate (make resident in the page tables) a region of the vmspace. + * + * Presently the [start, end) span must equal a region previously mapped by a + * call to vmspace_map(). + */ +int +vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end) +{ + const size_t size = end - start; + vmspace_mapping_t *vmsm; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + + vm_object_t *vmo = vmsm->vmsm_object; + const int prot = vmsm->vmsm_prot; + const uint8_t attr = vmo->vmo_attr; + size_t populated = 0; + for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) { + const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); + VERIFY(pfn != PFN_INVALID); + + if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) { + populated++; + } + } + atomic_add_64(&vms->vms_pages_mapped, populated); + + mutex_exit(&vms->vms_lock); + return (0); +} + +/* + * Allocate a client from a given vmspace. + */ +vm_client_t * +vmspace_client_alloc(vmspace_t *vms) +{ + vm_client_t *vmc; + + vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP); + vmc->vmc_space = vms; + mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL); + vmc->vmc_state = VCS_IDLE; + vmc->vmc_cpu_active = -1; + list_create(&vmc->vmc_held_pages, sizeof (vm_page_t), + offsetof(vm_page_t, vmp_node)); + vmc->vmc_track_dirty = vms->vms_track_dirty; + + mutex_enter(&vms->vms_lock); + list_insert_tail(&vms->vms_clients, vmc); + mutex_exit(&vms->vms_lock); + + return (vmc); +} + +/* + * Get the nested page table root pointer (EPTP/NCR3) value. + */ +uint64_t +vmspace_table_root(vmspace_t *vms) +{ + return (vmm_gpt_get_pmtp(vms->vms_gpt)); +} + +/* + * Get the current generation number of the nested page table. + */ +uint64_t +vmspace_table_gen(vmspace_t *vms) +{ + return (vms->vms_pt_gen); +} + +/* + * Mark a vm_client as active. This will block if/while the client is held by + * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will + * fail if the vm_client has been orphaned. + */ +static int +vmc_activate(vm_client_t *vmc) +{ + mutex_enter(&vmc->vmc_lock); + VERIFY0(vmc->vmc_state & VCS_ACTIVE); + if ((vmc->vmc_state & VCS_ORPHANED) != 0) { + mutex_exit(&vmc->vmc_lock); + return (ENXIO); + } + while ((vmc->vmc_state & VCS_HOLD) != 0) { + cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); + } + vmc->vmc_state |= VCS_ACTIVE; + return (0); +} + +/* + * Mark a vm_client as no longer active. It must be called with + * vm_client_t`vmc_lock already held, and will return with it released. 
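+ *
+ * Together with vmc_activate() above, this forms the bracket used by the
+ * data-path entry points (vmc_hold(), vmc_fault(), vmc_set_inval_cb());
+ * a sketch of the pattern, with the actual vmspace access elided:
+ *
+ *	if (vmc_activate(vmc) == 0) {
+ *		... access state in vmc->vmc_space ...
+ *		vmc_deactivate(vmc);
+ *	}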
+ */ +static void +vmc_deactivate(vm_client_t *vmc) +{ + ASSERT(MUTEX_HELD(&vmc->vmc_lock)); + VERIFY(vmc->vmc_state & VCS_ACTIVE); + + vmc->vmc_state ^= VCS_ACTIVE; + if ((vmc->vmc_state & VCS_HOLD) != 0) { + cv_broadcast(&vmc->vmc_cv); + } + mutex_exit(&vmc->vmc_lock); +} + +/* + * Indicate that a CPU will be utilizing the nested page tables through this VM + * client. Interrupts (and/or the GIF) are expected to be disabled when calling + * this function. Returns the generation number of the nested page table (to be + * used for TLB invalidations). + */ +uint64_t +vmc_table_enter(vm_client_t *vmc) +{ + vmspace_t *vms = vmc->vmc_space; + uint64_t gen; + + ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU)); + ASSERT3S(vmc->vmc_cpu_active, ==, -1); + + /* + * Since the NPT activation occurs with interrupts disabled, this must + * be done without taking vmc_lock like normal. + */ + gen = vms->vms_pt_gen; + vmc->vmc_cpu_active = CPU->cpu_id; + vmc->vmc_cpu_gen = gen; + atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU); + + return (gen); +} + +/* + * Indicate that this VM client is not longer (directly) using the underlying + * page tables. Interrupts (and/or the GIF) must be enabled prior to calling + * this function. + */ +void +vmc_table_exit(vm_client_t *vmc) +{ + mutex_enter(&vmc->vmc_lock); + + ASSERT(vmc->vmc_state & VCS_ON_CPU); + vmc->vmc_state ^= VCS_ON_CPU; + vmc->vmc_cpu_active = -1; + if ((vmc->vmc_state & VCS_HOLD) != 0) { + cv_broadcast(&vmc->vmc_cv); + } + + mutex_exit(&vmc->vmc_lock); +} + +static void +vmc_space_hold(vm_client_t *vmc) +{ + mutex_enter(&vmc->vmc_lock); + VERIFY0(vmc->vmc_state & VCS_HOLD); + + /* + * Because vmc_table_enter() alters vmc_state from a context where + * interrupts are disabled, it cannot pay heed to vmc_lock, so setting + * VMC_HOLD must be done atomically here. + */ + atomic_or_uint(&vmc->vmc_state, VCS_HOLD); + + /* Wait for client to go inactive */ + while ((vmc->vmc_state & VCS_ACTIVE) != 0) { + cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); + } + mutex_exit(&vmc->vmc_lock); +} + +static void +vmc_space_release(vm_client_t *vmc, bool kick_on_cpu) +{ + mutex_enter(&vmc->vmc_lock); + VERIFY(vmc->vmc_state & VCS_HOLD); + + if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) { + poke_cpu(vmc->vmc_cpu_active); + + while ((vmc->vmc_state & VCS_ON_CPU) != 0) { + cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); + } + } + + /* + * Because vmc_table_enter() alters vmc_state from a context where + * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing + * VMC_HOLD must be done atomically here. + */ + atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD); + cv_broadcast(&vmc->vmc_cv); + mutex_exit(&vmc->vmc_lock); +} + +static void +vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size, + uint64_t gen) +{ + mutex_enter(&vmc->vmc_lock); + VERIFY(vmc->vmc_state & VCS_HOLD); + if ((vmc->vmc_state & VCS_ON_CPU) != 0) { + /* + * Wait for clients using an old generation of the page tables + * to exit guest context, where they subsequently flush the TLB + * for the new generation. + */ + if (vmc->vmc_cpu_gen < gen) { + poke_cpu(vmc->vmc_cpu_active); + + while ((vmc->vmc_state & VCS_ON_CPU) != 0) { + cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); + } + } + } + if (vmc->vmc_inval_func != NULL) { + vmc_inval_cb_t func = vmc->vmc_inval_func; + void *data = vmc->vmc_inval_data; + + /* + * Perform the actual invalidation call outside vmc_lock to + * avoid lock ordering issues in the consumer. Since the client + * is under VCS_HOLD, this is safe. 
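+ *
+ * The callback's shape is that registered through vmc_set_inval_cb();
+ * a hypothetical consumer (illustrative only, assuming a void return)
+ * might look like:
+ *
+ *	static void
+ *	my_inval_cb(void *data, uintptr_t gpa, size_t len)
+ *	{
+ *		... drop any cached translations covering [gpa, gpa + len) ...
+ *	}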
+ */ + mutex_exit(&vmc->vmc_lock); + func(data, addr, size); + mutex_enter(&vmc->vmc_lock); + } + mutex_exit(&vmc->vmc_lock); +} + +static void +vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size, + vm_object_t *vmo) +{ + mutex_enter(&vmc->vmc_lock); + VERIFY(vmc->vmc_state & VCS_HOLD); + + /* + * With the current vCPU exclusion invariants in place, we do not expect + * a vCPU to be in guest context during an unmap. + */ + VERIFY0(vmc->vmc_state & VCS_ON_CPU); + + /* + * Any holds against the unmapped region need to establish their own + * reference to the underlying object to avoid a potential + * use-after-free. + */ + for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages); + vmp != NULL; + vmp = list_next(&vmc->vmc_held_pages, vmc)) { + if (vmp->vmp_gpa < addr || + vmp->vmp_gpa >= (addr + size)) { + /* Hold outside region in question */ + continue; + } + if (vmp->vmp_obj_ref == NULL) { + vm_object_reference(vmo); + vmp->vmp_obj_ref = vmo; + /* For an unmapped region, PTE is now meaningless */ + vmp->vmp_ptep = NULL; + } else { + /* + * Object could have gone through cycle of + * unmap-map-unmap before the hold was released. + */ + VERIFY3P(vmp->vmp_ptep, ==, NULL); + } + } + mutex_exit(&vmc->vmc_lock); +} + +static vm_client_t * +vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms) +{ + vm_client_t *next; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + mutex_enter(&vmc->vmc_lock); + VERIFY3P(vmc->vmc_space, ==, vms); + VERIFY0(vmc->vmc_state & VCS_ORPHANED); + if (vmc->vmc_state & VCS_DESTROY) { + /* + * This vm_client is currently undergoing destruction, so it + * does not need to be orphaned. Let it proceed with its own + * clean-up task. + */ + next = list_next(&vms->vms_clients, vmc); + } else { + /* + * Clients are only orphaned when the containing vmspace is + * being torn down. All mappings from the vmspace should + * already be gone, meaning any remaining held pages should have + * direct references to the object. + */ + for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages); + vmp != NULL; + vmp = list_next(&vmc->vmc_held_pages, vmp)) { + ASSERT3P(vmp->vmp_ptep, ==, NULL); + ASSERT3P(vmp->vmp_obj_ref, !=, NULL); + } + + /* + * After this point, the client will be orphaned, unable to + * establish new page holds (or access any vmspace-related + * resources) and is in charge of cleaning up after itself. + */ + vmc->vmc_state |= VCS_ORPHANED; + next = list_next(&vms->vms_clients, vmc); + list_remove(&vms->vms_clients, vmc); + vmc->vmc_space = NULL; + } + mutex_exit(&vmc->vmc_lock); + return (next); +} + +/* + * Attempt to hold a page at `gpa` inside the referenced vmspace. 
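+ *
+ * A sketch of the typical hold/use/release cycle, assuming the caller
+ * wants write access to a page-aligned guest-physical address `gpa`
+ * (vmc_hold() returns NULL on failure):
+ *
+ *	vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_WRITE);
+ *	if (vmp != NULL) {
+ *		void *va = vmp_get_writable(vmp);
+ *		... copy data to or from va ...
+ *		(void) vmp_release(vmp);
+ *	}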
+ */ +vm_page_t * +vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot) +{ + vmspace_t *vms = vmc->vmc_space; + vm_page_t *vmp; + pfn_t pfn = PFN_INVALID; + uint64_t *ptep = NULL; + + ASSERT0(gpa & PAGEOFFSET); + ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE); + + vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP); + if (vmc_activate(vmc) != 0) { + kmem_free(vmp, sizeof (*vmp)); + return (NULL); + } + + if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) { + vmc_deactivate(vmc); + kmem_free(vmp, sizeof (*vmp)); + return (NULL); + } + ASSERT(pfn != PFN_INVALID && ptep != NULL); + + vmp->vmp_client = vmc; + vmp->vmp_chain = NULL; + vmp->vmp_gpa = gpa; + vmp->vmp_pfn = pfn; + vmp->vmp_ptep = ptep; + vmp->vmp_obj_ref = NULL; + vmp->vmp_prot = prot; + list_insert_tail(&vmc->vmc_held_pages, vmp); + vmc_deactivate(vmc); + + return (vmp); +} + +int +vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot) +{ + vmspace_t *vms = vmc->vmc_space; + int err; + + err = vmc_activate(vmc); + if (err == 0) { + err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL); + vmc_deactivate(vmc); + } + + return (err); +} + +/* + * Allocate an additional vm_client_t, based on an existing one. Only the + * associatation with the vmspace is cloned, not existing holds or any + * configured invalidation function. + */ +vm_client_t * +vmc_clone(vm_client_t *vmc) +{ + vmspace_t *vms = vmc->vmc_space; + + return (vmspace_client_alloc(vms)); +} + +/* + * Register a function (and associated data pointer) to be called when an + * address range in the vmspace is invalidated. + */ +int +vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data) +{ + int err; + + err = vmc_activate(vmc); + if (err == 0) { + vmc->vmc_inval_func = func; + vmc->vmc_inval_data = data; + vmc_deactivate(vmc); + } + + return (err); +} + +/* + * Destroy a vm_client_t instance. + * + * No pages held through this vm_client_t may be outstanding when performing a + * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to + * vmc_table_exit() has been made). + */ +void +vmc_destroy(vm_client_t *vmc) +{ + mutex_enter(&vmc->vmc_lock); + + VERIFY(list_is_empty(&vmc->vmc_held_pages)); + VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU)); + + if ((vmc->vmc_state & VCS_ORPHANED) == 0) { + vmspace_t *vms; + + /* + * Deassociation with the parent vmspace must be done carefully: + * The vmspace could attempt to orphan this vm_client while we + * release vmc_lock in order to take vms_lock (the required + * order). The client is marked to indicate that destruction is + * under way. Doing so prevents any racing orphan operation + * from applying to this client, allowing us to deassociate from + * the vmspace safely. + */ + vmc->vmc_state |= VCS_DESTROY; + vms = vmc->vmc_space; + mutex_exit(&vmc->vmc_lock); + + mutex_enter(&vms->vms_lock); + mutex_enter(&vmc->vmc_lock); + list_remove(&vms->vms_clients, vmc); + /* + * If the vmspace began its own destruction operation while we + * were navigating the locks, be sure to notify it about this + * vm_client being deassociated. 
+ */ + cv_signal(&vms->vms_cv); + mutex_exit(&vmc->vmc_lock); + mutex_exit(&vms->vms_lock); + } else { + VERIFY3P(vmc->vmc_space, ==, NULL); + mutex_exit(&vmc->vmc_lock); + } + + mutex_destroy(&vmc->vmc_lock); + cv_destroy(&vmc->vmc_cv); + list_destroy(&vmc->vmc_held_pages); + + kmem_free(vmc, sizeof (*vmc)); +} + +static __inline void * +vmp_ptr(const vm_page_t *vmp) +{ + ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID); + + const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT); + return ((void *)((uintptr_t)kpm_vbase + paddr)); +} + +/* + * Get a readable kernel-virtual pointer for a held page. + * + * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold() + * call to acquire this page reference. + */ +const void * +vmp_get_readable(const vm_page_t *vmp) +{ + ASSERT(vmp->vmp_prot & PROT_READ); + + return (vmp_ptr(vmp)); +} + +/* + * Get a writable kernel-virtual pointer for a held page. + * + * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold() + * call to acquire this page reference. + */ +void * +vmp_get_writable(const vm_page_t *vmp) +{ + ASSERT(vmp->vmp_prot & PROT_WRITE); + + return (vmp_ptr(vmp)); +} + +/* + * Get the host-physical PFN for a held page. + */ +pfn_t +vmp_get_pfn(const vm_page_t *vmp) +{ + return (vmp->vmp_pfn); +} + +/* + * Store a pointer to `to_chain` in the page-chaining slot of `vmp`. + */ +void +vmp_chain(vm_page_t *vmp, vm_page_t *to_chain) +{ + ASSERT3P(vmp->vmp_chain, ==, NULL); + + vmp->vmp_chain = to_chain; +} + +/* + * Retrieve the pointer from the page-chaining in `vmp`. + */ +vm_page_t * +vmp_next(const vm_page_t *vmp) +{ + return (vmp->vmp_chain); +} + +static __inline bool +vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc) +{ + ASSERT(MUTEX_HELD(&vmc->vmc_lock)); + + bool was_unmapped = false; + + list_remove(&vmc->vmc_held_pages, vmp); + if (vmp->vmp_obj_ref != NULL) { + ASSERT3P(vmp->vmp_ptep, ==, NULL); + + vm_object_release(vmp->vmp_obj_ref); + was_unmapped = true; + } else { + ASSERT3P(vmp->vmp_ptep, !=, NULL); + + if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) { + vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt; + vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true); + } + } + kmem_free(vmp, sizeof (*vmp)); + return (was_unmapped); +} + +/* + * Release held page. Returns true if page resided on region which was + * subsequently unmapped. + */ +bool +vmp_release(vm_page_t *vmp) +{ + vm_client_t *vmc = vmp->vmp_client; + + VERIFY(vmc != NULL); + + mutex_enter(&vmc->vmc_lock); + const bool was_unmapped = vmp_release_inner(vmp, vmc); + mutex_exit(&vmc->vmc_lock); + return (was_unmapped); +} + +/* + * Release a chain of pages which were associated via vmp_chain() (setting + * page-chaining pointer). Returns true if any pages resided upon a region + * which was subsequently unmapped. + * + * All of those pages must have been held through the same vm_client_t. 
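+ *
+ * A sketch of building and tearing down such a chain for a two-page
+ * guest buffer (names are illustrative; NULL checks on the holds are
+ * omitted, and both holds come from the same client `vmc`):
+ *
+ *	vm_page_t *head = vmc_hold(vmc, gpa, PROT_READ);
+ *	vm_page_t *next = vmc_hold(vmc, gpa + PAGESIZE, PROT_READ);
+ *	vmp_chain(head, next);
+ *	... access both pages via vmp_get_readable() ...
+ *	(void) vmp_release_chain(head);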
+ */ +bool +vmp_release_chain(vm_page_t *vmp) +{ + vm_client_t *vmc = vmp->vmp_client; + bool any_unmapped = false; + + ASSERT(vmp != NULL); + + mutex_enter(&vmc->vmc_lock); + while (vmp != NULL) { + vm_page_t *next = vmp->vmp_chain; + + /* We expect all pages in chain to be from same client */ + ASSERT3P(vmp->vmp_client, ==, vmc); + + if (vmp_release_inner(vmp, vmc)) { + any_unmapped = true; + } + vmp = next; + } + mutex_exit(&vmc->vmc_lock); + return (any_unmapped); +} + + +int +vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len, + struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + vm_object_t *vmo; + int err; + + if (segoff < 0 || len <= 0 || + (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (ENOTSUP); + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + return (err); + } + + VERIFY(segoff >= 0); + VERIFY(len <= vmo->vmo_size); + VERIFY((len + segoff) <= vmo->vmo_size); + + if (vmo->vmo_type != VMOT_MEM) { + /* Only support memory objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.prot = prot; + svma.offset = segoff; + svma.vmo = vmo; + svma.vmc = NULL; + + err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + + const uintptr_t gpa = (uintptr_t)off; + const size_t size = (uintptr_t)len; + int err; + + if (off < 0 || len <= 0 || + (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.prot = prot; + svma.offset = gpa; + svma.vmo = NULL; + svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm)); + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} diff --git a/usr/src/uts/intel/io/vmm/vmm_zsd.c b/usr/src/uts/intel/io/vmm/vmm_zsd.c new file mode 100644 index 0000000000..d396c89e58 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_zsd.c @@ -0,0 +1,220 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#include <sys/cpuvar.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/types.h> +#include <sys/vmm.h> +#include <sys/vmm_kernel.h> +#include <sys/vmm_impl.h> +#include <sys/zone.h> + +/* + * zone specific data + * + * Zone specific data is used to keep an association between zones and the vmm + * instances that may be running in them. This is used to ensure that vmm + * instances do not outlive their parent zone. + * + * Locking strategy + * + * The global vmm_zsd_lock is held while modifying vmm_zsd_list. 
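+ *
+ * Note that in the code below these two locks are never held at the same
+ * time: callers take vmm_zsd_lock only long enough to find or link a
+ * vmm_zsd_t, drop it, and only then take that instance's per-zone lock
+ * (described next), so no ordering cycle can arise between them.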
+ * + * The per zone vz_lock in vmm_zsd_t is held while reading or writing anything + * within in vmm_zsd_t instance. This is important to ensure that there's not + * an accidental VM creating as a zone is going down. + */ + +/* + * One of these per zone. + */ +struct vmm_zsd { + list_t vz_vmms; /* vmm instances in the zone */ + list_node_t vz_linkage; /* link to other zones */ + boolean_t vz_active; /* B_FALSE early in shutdown callback */ + zoneid_t vz_zoneid; + kmutex_t vz_lock; +}; + +static kmutex_t vmm_zsd_lock; /* Protects vmm_zsd_list */ +static list_t vmm_zsd_list; /* Linkage between all zsd instances */ + +static zone_key_t vmm_zsd_key; + +int +vmm_zsd_add_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd; + + ASSERT(sc->vmm_zone != NULL); + + mutex_enter(&vmm_zsd_lock); + + for (zsd = list_head(&vmm_zsd_list); zsd != NULL; + zsd = list_next(&vmm_zsd_list, zsd)) { + if (zsd->vz_zoneid == sc->vmm_zone->zone_id) { + break; + } + } + + VERIFY(zsd != NULL); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + if (!zsd->vz_active) { + mutex_exit(&zsd->vz_lock); + return (ENOSYS); + } + + sc->vmm_zsd = zsd; + list_insert_tail(&zsd->vz_vmms, sc); + + mutex_exit(&zsd->vz_lock); + + return (0); +} + +void +vmm_zsd_rem_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd = sc->vmm_zsd; + + mutex_enter(&zsd->vz_lock); + + list_remove(&zsd->vz_vmms, sc); + sc->vmm_zsd = NULL; + + mutex_exit(&zsd->vz_lock); +} + +static void * +vmm_zsd_create(zoneid_t zid) +{ + vmm_zsd_t *zsd; + zone_t *zone; + + zsd = kmem_zalloc(sizeof (*zsd), KM_SLEEP); + + list_create(&zsd->vz_vmms, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_zsd_linkage)); + + zsd->vz_zoneid = zid; + + mutex_init(&zsd->vz_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * If the vmm module is loaded while this zone is in the midst of + * shutting down, vmm_zsd_destroy() may be called without + * vmm_zsd_shutdown() ever being called. If it is shutting down, there + * is no sense in letting any in-flight VM creation succeed so set + * vz_active accordingly. + * + * zone_find_by_id_nolock() is used rather than zone_find_by_id() + * so that the zone is returned regardless of state. + */ + zone = zone_find_by_id_nolock(zid); + VERIFY(zone != NULL); + zsd->vz_active = zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN; + + mutex_enter(&vmm_zsd_lock); + list_insert_tail(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + return (zsd); +} + +/* + * Tells all runing VMs in the zone to poweroff. This does not reclaim guest + * resources (memory, etc.). + */ +static void +vmm_zsd_shutdown(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&zsd->vz_lock); + + /* + * This may already be B_FALSE. See comment in vmm_zsd_create(). If it + * is already B_FALSE we will take a quick trip through the empty list. + */ + zsd->vz_active = B_FALSE; + + for (sc = list_head(&zsd->vz_vmms); sc != NULL; + sc = list_next(&zsd->vz_vmms, sc)) { + /* Send a poweroff to the VM, whether running or not. */ + (void) vm_suspend(sc->vmm_vm, VM_SUSPEND_POWEROFF); + } + mutex_exit(&zsd->vz_lock); +} + +/* + * Reap all VMs that remain and free up guest resources. 
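+ *
+ * Besides zone halt, zone_key_delete() in vmm_zsd_fini() below invokes
+ * this callback for every remaining zone, so module unload (assuming
+ * _fini() calls vmm_zsd_fini()) shares the same reclamation path.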
+ */ +static void +vmm_zsd_destroy(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&vmm_zsd_lock); + list_remove(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + ASSERT(!zsd->vz_active); + + while ((sc = list_remove_head(&zsd->vz_vmms)) != NULL) { + int err; + + /* + * This frees all resources associated with the vm, including + * sc. + */ + err = vmm_do_vm_destroy(sc, B_FALSE); + ASSERT3S(err, ==, 0); + } + + mutex_exit(&zsd->vz_lock); + mutex_destroy(&zsd->vz_lock); + + kmem_free(zsd, sizeof (*zsd)); +} + +void +vmm_zsd_init(void) +{ + mutex_init(&vmm_zsd_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vmm_zsd_list, sizeof (vmm_zsd_t), + offsetof(vmm_zsd_t, vz_linkage)); + zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown, + vmm_zsd_destroy); +} + +void +vmm_zsd_fini(void) +{ + /* Calls vmm_zsd_destroy() on all zones. */ + zone_key_delete(vmm_zsd_key); + ASSERT(list_is_empty(&vmm_zsd_list)); + + list_destroy(&vmm_zsd_list); + mutex_destroy(&vmm_zsd_lock); +} diff --git a/usr/src/uts/intel/io/vmm/x86.c b/usr/src/uts/intel/io/vmm/x86.c new file mode 100644 index 0000000000..187c89afd0 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/x86.c @@ -0,0 +1,695 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
+ * Copyright 2020 Oxide Computer Company + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/x86_archext.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include <machine/vmm.h> + +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" +#include "x86.h" + +SYSCTL_DECL(_hw_vmm); + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +/* Number of times an unknown cpuid leaf was accessed */ +static uint64_t bhyve_xcpuids; + +static int cpuid_leaf_b = 1; + +/* + * Force exposition of the invariant TSC capability, regardless of whether the + * host CPU reports having it. + */ +static int vmm_force_invariant_tsc = 0; + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(uint_t x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) +{ + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0, param; + enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + /* + * The function of CPUID is controlled through the provided value of + * %eax (and secondarily %ecx, for certain leaf data). + */ + func = (uint32_t)*rax; + param = (uint32_t)*rcx; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && func >= 0x80000000) { + if (func > cpu_exthigh) + func = cpu_exthigh; + } else if (func >= 0x40000000) { + if (func > CPUID_VM_HIGH) + func = CPUID_VM_HIGH; + } else if (func > cpu_high) { + func = cpu_high; + } + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(func, param, regs); + break; + case CPUID_8000_0008: + cpuid_count(func, param, regs); + if (vmm_is_svm()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). + */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | + logical_cpus; + } + break; + + case CPUID_8000_0001: + cpuid_count(func, param, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. 
+ */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + cpuid_count(func, param, regs); + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + + /* + * If the host system possesses an invariant TSC, then + * it is safe to expose to the guest. + * + * If there is measured skew between host TSCs, it will + * be properly offset so guests do not observe any + * change between CPU migrations. + */ + regs[3] &= AMDPM_TSC_INVARIANT; + + /* + * Since illumos avoids deep C-states on CPUs which do + * not support an invariant TSC, it may be safe (and + * desired) to unconditionally expose that capability to + * the guest. + */ + if (vmm_force_invariant_tsc != 0) { + regs[3] |= AMDPM_TSC_INVARIANT; + } + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_svm()) + goto default_leaf; + + /* + * Similar to Intel, generate a ficticious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. + */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (param) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* + * AMD Family 16h+ and Hygon Family 18h additional + * identifiers. + */ + if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. 
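+ *
+ * As a worked example of the encoding above (illustrative only): for a
+ * guest topology with 2 threads per core, `threads` is clamped to 1 and
+ * regs[1] becomes (1 << 8) | (vcpu_id >> log2(2)), so vCPUs 0 and 1
+ * report unit/core id 0, vCPUs 2 and 3 report id 1, and so on.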
+ */ + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TME or SMX capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; + regs[1] &= ~CPUID_HTT_CORES; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(func, param, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (param == 0) { + cpuid_count(func, param, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | + CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_SMAP | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. 
*/ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Intel processor topology enumeration + */ + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (param == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (param == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || param >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (param & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(func, param, regs); + switch (param) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << param))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case CPUID_0000_000F: + case CPUID_0000_0010: + /* + * Do not report any Resource Director Technology + * capabilities. Exposing control of cache or memory + * controller resource partitioning to the guest is not + * at all sensible. + * + * This is already hidden at a high level by masking of + * leaf 0x7. Even still, a guest may look here for + * detailed capability information. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0015: + /* + * Don't report CPU TSC/Crystal ratio and clock + * values since guests may use these to derive the + * local APIC frequency.. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + default: +default_leaf: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(func, param, regs); + break; + } + + /* + * CPUID clears the upper 32-bits of the long-mode registers. 
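+	 * The stores below provide that behavior: regs[] holds 32-bit
+	 * values, so assigning them through the 64-bit register pointers
+	 * zero-extends the upper halves, just as the instruction does in
+	 * hardware.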
+ */ + *rax = regs[0]; + *rbx = regs[1]; + *rcx = regs[2]; + *rdx = regs[3]; + + return (1); +} + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. + */ + rv = false; + switch (cap) { +#ifdef __FreeBSD__ + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; +#else + case VCC_NO_EXECUTE: + if (is_x86_feature(x86_featureset, X86FSET_NX)) + rv = true; + break; + /* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */ + case VCC_FFXSR: + case VCC_TCE: + break; +#endif + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/usr/src/uts/intel/io/vmm/x86.h b/usr/src/uts/intel/io/vmm/x86.h new file mode 100644 index 0000000000..f3459e4f8a --- /dev/null +++ b/usr/src/uts/intel/io/vmm/x86.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_0000_000F (0xF) +#define CPUID_0000_0010 (0x10) +#define CPUID_0000_0015 (0x15) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx); + +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); +#endif diff --git a/usr/src/uts/intel/os/hma.c b/usr/src/uts/intel/os/hma.c new file mode 100644 index 0000000000..215243ea98 --- /dev/null +++ b/usr/src/uts/intel/os/hma.c @@ -0,0 +1,749 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/cpuvar.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/machsystm.h> +#include <sys/archsystm.h> +#include <sys/controlregs.h> +#include <sys/x86_archext.h> +#include <sys/id_space.h> +#include <sys/hma.h> +#include <sys/cmn_err.h> +#include <vm/hat.h> +#include <vm/as.h> + +struct hma_reg { + const char *hr_name; + list_node_t hr_node; +}; + +static kmutex_t hma_lock; +static list_t hma_registrations; +static boolean_t hma_exclusive = B_FALSE; +int hma_disable = 0; + +static boolean_t hma_vmx_ready = B_FALSE; +static const char *hma_vmx_error = NULL; +static id_space_t *hma_vmx_vpid; + +/* + * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a + * mutex specific to the module. It (cpu_lock) is already required for the + * state needed to perform setup on all CPUs, so it was a natural fit to + * protect this data too. 
+ */ +typedef enum hma_cpu_state { + HCS_UNINITIALIZED = 0, + HCS_READY, + HCS_ERROR +} hma_cpu_state_t; +static hma_cpu_state_t hma_cpu_status[NCPU]; + +/* HMA-internal tracking of optional VMX capabilities */ +typedef enum { + HVC_EPT = (1 << 0), + HVC_VPID = (1 << 1), + HVC_INVEPT_ONE = (1 << 2), + HVC_INVEPT_ALL = (1 << 3), +} hma_vmx_capab_t; + +static void *hma_vmx_vmxon_page[NCPU]; +static uintptr_t hma_vmx_vmxon_pa[NCPU]; +static uint32_t hma_vmx_revision; +static hma_vmx_capab_t hma_vmx_capabs = 0; + +static boolean_t hma_svm_ready = B_FALSE; +static const char *hma_svm_error = NULL; +static uint32_t hma_svm_features; +static uint32_t hma_svm_max_asid; + +static void *hma_svm_hsave_page[NCPU]; +static uintptr_t hma_svm_hsave_pa[NCPU]; + +static hma_svm_asid_t hma_svm_cpu_asid[NCPU]; + + +static int hma_vmx_init(void); +static int hma_svm_init(void); + +/* Helpers from ml/hma_asm.s */ +int hma_vmx_do_invept(int, uintptr_t); +int hma_vmx_vmxon(uintptr_t); + +void +hma_init(void) +{ + mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&hma_registrations, sizeof (struct hma_reg), + offsetof(struct hma_reg, hr_node)); + + if (hma_disable != 0) { + cmn_err(CE_CONT, "?hma_init: disabled"); + return; + } + + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + (void) hma_vmx_init(); + break; + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + (void) hma_svm_init(); + break; + default: + break; + } +} + +static hma_reg_t * +hma_register_backend(const char *name) +{ + struct hma_reg *reg; + boolean_t is_ready; + + ASSERT(MUTEX_HELD(&hma_lock)); + + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + is_ready = hma_vmx_ready; + break; + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + is_ready = hma_svm_ready; + break; + default: + is_ready = B_FALSE; + break; + } + + if (!is_ready) + return (NULL); + + reg = kmem_zalloc(sizeof (*reg), KM_SLEEP); + reg->hr_name = name; + list_insert_tail(&hma_registrations, reg); + + return (reg); +} + +hma_reg_t * +hma_register(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (!hma_exclusive) + reg = hma_register_backend(name); + + mutex_exit(&hma_lock); + + return (reg); +} + +hma_reg_t * +hma_register_exclusive(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (list_is_empty(&hma_registrations)) { + reg = hma_register_backend(name); + if (reg != NULL) + hma_exclusive = B_TRUE; + } + + mutex_exit(&hma_lock); + + return (reg); +} + +void +hma_unregister(hma_reg_t *reg) +{ + VERIFY(reg != NULL); + VERIFY(!list_is_empty(&hma_registrations)); + + mutex_enter(&hma_lock); + list_remove(&hma_registrations, reg); + if (hma_exclusive && list_is_empty(&hma_registrations)) + hma_exclusive = B_FALSE; + mutex_exit(&hma_lock); + kmem_free(reg, sizeof (*reg)); +} + +/* + * VPID 0 is reserved for instances where VPID is disabled. Some hypervisors + * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if + * unique VPIDs could not be allocated for all the vCPUs belonging to a VM. 
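+ *
+ * As a rough usage sketch (hypothetical consumer code, not part of this
+ * change), a per-vCPU consumer of the interfaces below might do:
+ *
+ *	uint16_t vpid = hma_vmx_vpid_alloc();
+ *
+ *	(a result of 0 means no unique VPID is available, so run without
+ *	one; otherwise program the value into that vCPU's VMCS)
+ *
+ *	if (vpid != 0)
+ *		hma_vmx_vpid_free(vpid);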
+ */ +#define HMA_VPID_RESERVED NCPU + +uint16_t +hma_vmx_vpid_alloc(void) +{ + id_t res; + + /* Do not bother if the CPU lacks support */ + if ((hma_vmx_capabs & HVC_VPID) == 0) { + return (0); + } + + res = id_alloc_nosleep(hma_vmx_vpid); + if (res == -1) { + return (0); + } else { + ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX); + return (res); + } +} + +void +hma_vmx_vpid_free(uint16_t vpid) +{ + VERIFY(vpid > HMA_VPID_RESERVED); + id_free(hma_vmx_vpid, (id_t)vpid); +} + +#define INVEPT_SINGLE_CONTEXT 1 +#define INVEPT_ALL_CONTEXTS 2 + +static int +hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused) +{ + int flag = (int)arg1; + uintptr_t eptp = (uintptr_t)arg2; + + ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS); + + VERIFY0(hma_vmx_do_invept(flag, eptp)); + return (0); +} + +void +hma_vmx_invept_allcpus(uintptr_t eptp) +{ + int flag = -1; + cpuset_t set; + + if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) { + flag = INVEPT_SINGLE_CONTEXT; + } else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + flag = INVEPT_ALL_CONTEXTS; + eptp = 0; + } else { + return; + } + + cpuset_zero(&set); + mutex_enter(&cpu_lock); + + cpuset_or(&set, &cpu_active_set); + xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set), + hma_vmx_invept_xcall); + + mutex_exit(&cpu_lock); +} + +static int +hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + uint64_t fctrl; + processorid_t id = CPU->cpu_seqid; + void *vmxon_region = hma_vmx_vmxon_page[id]; + uintptr_t vmxon_pa = hma_vmx_vmxon_pa[id]; + + VERIFY(vmxon_region != NULL && vmxon_pa != 0); + + /* + * Ensure that the VMX support and lock bits are enabled in the + * feature-control MSR. + */ + fctrl = rdmsr(MSR_IA32_FEAT_CTRL); + if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 || + (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) { + fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK; + wrmsr(MSR_IA32_FEAT_CTRL, fctrl); + } + + setcr4(getcr4() | CR4_VMXE); + + if (hma_vmx_vmxon(vmxon_pa) == 0) { + hma_cpu_status[id] = HCS_READY; + } else { + hma_cpu_status[id] = HCS_ERROR; + + /* + * If VMX has already been marked active and available for the + * system, then failure to perform VMXON on a newly-onlined CPU + * represents a fatal problem. Continuing on would mean + * failure for any hypervisor thread which landed here. + */ + if (hma_vmx_ready) { + panic("VMXON failure after VMX marked ready"); + } + } + return (0); +} + +static int +hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + hma_cpu_state_t state; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + if (what != CPU_ON) { + /* + * For the purposes of VMX setup, only the CPU_ON event is of + * interest. Letting VMX state linger on an offline CPU should + * not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as %cr4 bits previously set). 
+ */ + return (0); + } + + state = hma_cpu_status[id]; + if (state == HCS_ERROR) { + return (-1); + } + + /* Allocate the VMXON page for this CPU, if not already done */ + if (hma_vmx_vmxon_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_vmx_vmxon_page[id] = va; + + /* Initialize the VMX revision field as expected */ + bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision)); + + /* + * Cache the physical address of the VMXON page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_vmx_vmxon_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_vmx_vmxon_pa[id] != 0); + } + + if (state == HCS_UNINITIALIZED) { + cpuset_t set; + + /* Activate VMX on this CPU */ + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon); + } else { + VERIFY3U(state, ==, HCS_READY); + + /* + * If an already-initialized CPU is going back online, perform + * an all-contexts invept to eliminate the possibility of + * cached EPT state causing issues. + */ + if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + cpuset_t set; + + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0, + CPUSET2BV(set), hma_vmx_invept_xcall); + } + } + + return (hma_cpu_status[id] != HCS_READY); +} + +/* + * Determining the availability of VM execution controls is somewhat different + * from conventional means, where one simply checks for asserted bits in the + * MSR value. Instead, these execution control MSRs are split into two halves: + * the lower 32-bits indicating capabilities which can be zeroed in the VMCS + * field and the upper 32-bits indicating capabilities which can be set to one. + * + * It is described in detail in Appendix A.3 of SDM volume 3. + */ +#define VMX_CTL_ONE_SETTING(val, flag) \ + (((val) & ((uint64_t)(flag) << 32)) != 0) + +static const char * +hma_vmx_query_details(void) +{ + boolean_t query_true_ctl = B_FALSE; + uint64_t msr; + + /* The basic INS/OUTS functionality is cited as a necessary prereq */ + msr = rdmsr(MSR_IA32_VMX_BASIC); + if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) { + return ("VMX does not support INS/OUTS"); + } + + /* Record the VMX revision for later VMXON usage */ + hma_vmx_revision = (uint32_t)msr; + + /* + * Bit 55 in the VMX_BASIC MSR determines how VMX control information + * can be queried. + */ + query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0; + + /* Check for EPT and VPID support */ + msr = rdmsr(query_true_ctl ? 
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) { + msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) { + hma_vmx_capabs |= HVC_EPT; + } + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) { + hma_vmx_capabs |= HVC_VPID; + } + } + + /* Check for INVEPT support */ + if ((hma_vmx_capabs & HVC_EPT) != 0) { + msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) { + if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ONE; + } + if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ALL; + } + } + } + + return (NULL); +} + +static int +hma_vmx_init(void) +{ + cpu_t *cp; + uint64_t msr; + int err = 0; + const char *msg = NULL; + + if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { + msg = "CPU does not support VMX"; + goto bail; + } + + /* Has the BIOS set the feature-control lock bit without VMX enabled? */ + msr = rdmsr(MSR_IA32_FEAT_CTRL); + if ((msr & IA32_FEAT_CTRL_LOCK) != 0 && + (msr & IA32_FEAT_CTRL_VMX_EN) == 0) { + msg = "VMX support disabled by BIOS"; + goto bail; + } + + msg = hma_vmx_query_details(); + if (msg != NULL) { + goto bail; + } + + mutex_enter(&cpu_lock); + /* Perform VMX configuration for already-online CPUs. */ + cp = cpu_active; + do { + err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during VMXON setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_vmx_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1, + UINT16_MAX); + hma_vmx_ready = B_TRUE; + + return (0); + +bail: + hma_vmx_error = msg; + cmn_err(CE_NOTE, "hma_vmx_init: %s", msg); + return (-1); +} + +#define VMCB_FLUSH_NOTHING 0x0 +#define VMCB_FLUSH_ALL 0x1 +#define VMCB_FLUSH_ASID 0x3 + +void +hma_svm_asid_init(hma_svm_asid_t *vcp) +{ + /* + * Initialize the generation to 0, forcing an ASID allocation on first + * entry. Leave the ASID at 0, so if the host forgoes the call to + * hma_svm_asid_update(), SVM will bail on the invalid vcpu state. + */ + vcp->hsa_gen = 0; + vcp->hsa_asid = 0; +} + +uint8_t +hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid, + boolean_t npt_flush) +{ + hma_svm_asid_t *hcp; + ulong_t iflag; + uint8_t res = VMCB_FLUSH_NOTHING; + + /* + * If NPT changes dictate a TLB flush and by-ASID flushing is not + * supported/used, force a fresh ASID allocation. + */ + if (npt_flush && !flush_by_asid) { + vcp->hsa_gen = 0; + } + + /* + * It is expected that ASID resource updates will commonly be done + * inside a VMM critical section where the GIF is already cleared, + * preventing any possibility of interruption. Since that cannot be + * checked (there is no easy way to read the GIF), %rflags.IF is also + * cleared for edge cases where an ASID update is performed outside of + * such a GIF-safe critical section. + */ + iflag = intr_clear(); + + hcp = &hma_svm_cpu_asid[CPU->cpu_seqid]; + if (vcp->hsa_gen != hcp->hsa_gen) { + hcp->hsa_asid++; + + if (hcp->hsa_asid >= hma_svm_max_asid) { + /* Keep the ASID properly constrained */ + hcp->hsa_asid = 1; + hcp->hsa_gen++; + if (hcp->hsa_gen == 0) { + /* + * Stay clear of the '0' sentinel value for + * generation, if wrapping around. 
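+				 * (Generation 0 is what hma_svm_asid_init()
+				 * assigns in order to force a fresh ASID
+				 * allocation on a vCPU's first entry.)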
+ */ + hcp->hsa_gen = 1; + } + } + vcp->hsa_gen = hcp->hsa_gen; + vcp->hsa_asid = hcp->hsa_asid; + + ASSERT(vcp->hsa_asid != 0); + ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid); + + if (flush_by_asid) { + res = VMCB_FLUSH_ASID; + } else { + res = VMCB_FLUSH_ALL; + } + } else if (npt_flush) { + ASSERT(flush_by_asid); + res = VMCB_FLUSH_ASID; + } + + intr_restore(iflag); + return (res); +} + +static int +hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + const processorid_t id = CPU->cpu_seqid; + const uintptr_t hsave_pa = hma_svm_hsave_pa[id]; + uint64_t efer; + + VERIFY(hsave_pa != 0); + + /* Enable SVM via EFER */ + efer = rdmsr(MSR_AMD_EFER); + efer |= AMD_EFER_SVME; + wrmsr(MSR_AMD_EFER, efer); + + /* Setup hsave area */ + wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa); + + hma_cpu_status[id] = HCS_READY; + return (0); +} + +static int +hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + break; + default: + /* + * Other events, such as CPU offlining, are of no interest. + * Letting the SVM state linger should not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as EFER bits previously set). + */ + return (0); + } + + /* Perform initialization if it has not been previously attempted. */ + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); + } + + /* Allocate the hsave page for this CPU */ + if (hma_svm_hsave_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_svm_hsave_page[id] = va; + + /* + * Cache the physical address of the hsave page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_svm_hsave_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_svm_hsave_pa[id] != 0); + } + + kpreempt_disable(); + if (CPU->cpu_seqid == id) { + /* Perform svm setup directly if this CPU is the target */ + (void) hma_svm_cpu_activate(0, 0, 0); + kpreempt_enable(); + } else { + cpuset_t set; + + /* Use a cross-call if a remote CPU is the target */ + kpreempt_enable(); + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate); + } + + return (hma_cpu_status[id] != HCS_READY); +} + +static int +hma_svm_init(void) +{ + uint64_t msr; + const char *msg = NULL; + struct cpuid_regs regs; + cpu_t *cp; + + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + msg = "CPU does not support SVM"; + goto bail; + } + + msr = rdmsr(MSR_AMD_VM_CR); + if ((msr & AMD_VM_CR_SVMDIS) != 0) { + msg = "SVM disabled by BIOS"; + goto bail; + } + + regs.cp_eax = 0x8000000a; + (void) cpuid_insn(NULL, ®s); + const uint32_t nasid = regs.cp_ebx; + const uint32_t feat = regs.cp_edx; + + if (nasid == 0) { + msg = "Not enough ASIDs for guests"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) { + msg = "CPU does not support nested paging"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NRIPS) == 0) { + msg = "CPU does not support NRIP save"; + goto bail; + } + + hma_svm_features = feat; + hma_svm_max_asid = nasid; + + mutex_enter(&cpu_lock); + /* Perform SVM configuration for already-online CPUs. 
*/ + cp = cpu_active; + do { + int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during SVM setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_svm_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + /* Initialize per-CPU ASID state. */ + for (uint_t i = 0; i < NCPU; i++) { + /* + * Skip past sentinel 0 value for generation. Doing so for + * ASID is unneeded, since it will be incremented during the + * first allocation. + */ + hma_svm_cpu_asid[i].hsa_gen = 1; + hma_svm_cpu_asid[i].hsa_asid = 0; + } + + hma_svm_ready = B_TRUE; + return (0); + +bail: + hma_svm_error = msg; + cmn_err(CE_NOTE, "hma_svm_init: %s", msg); + return (-1); +} diff --git a/usr/src/uts/intel/os/hma_fpu.c b/usr/src/uts/intel/os/hma_fpu.c new file mode 100644 index 0000000000..138af7a32a --- /dev/null +++ b/usr/src/uts/intel/os/hma_fpu.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + * Copyright 2022 Oxide Computer Company + */ + +/* + * This implements the hypervisor multiplexor FPU API. Its purpose is to make it + * easy to switch between the host and guest hypervisor while hiding all the + * details about CR0.TS and how to save the host's state as required. + */ + +#include <sys/pcb.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/hma.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/controlregs.h> +#include <sys/sysmacros.h> +#include <sys/stdbool.h> +#include <sys/ontrap.h> +#include <sys/cpuvar.h> +#include <sys/disp.h> + +struct hma_fpu { + fpu_ctx_t hf_guest_fpu; + kthread_t *hf_curthread; + boolean_t hf_inguest; +}; + +int +hma_fpu_init(hma_fpu_t *fpu) +{ + struct xsave_state *xs; + + ASSERT0(fpu->hf_inguest); + + switch (fp_save_mech) { + case FP_FXSAVE: + bcopy(&sse_initial, fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx, + sizeof (struct fxsave_state)); + fpu->hf_guest_fpu.fpu_xsave_mask = 0; + break; + case FP_XSAVE: + /* + * Zero everything in the xsave case as we may have data in + * the structure that's not part of the initial value (which + * only really deals with a small portion of the xsave state). 
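+		 * Only the legacy FP and SSE bits end up set in xstate_bv
+		 * below, so a later restore of this image loads those two
+		 * areas and returns every other component to its initial
+		 * state.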
+ */ + xs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs; + bzero(xs, cpuid_get_xsave_size()); + bcopy(&avx_initial, xs, sizeof (*xs)); + xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; + fpu->hf_guest_fpu.fpu_xsave_mask = XFEATURE_FP_ALL; + break; + default: + panic("Invalid fp_save_mech"); + } + + fpu->hf_guest_fpu.fpu_flags = FPU_EN | FPU_VALID; + + return (0); +} + +void +hma_fpu_free(hma_fpu_t *fpu) +{ + if (fpu == NULL) + return; + + ASSERT3P(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, !=, NULL); + kmem_cache_free(fpsave_cachep, + fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic); + kmem_free(fpu, sizeof (*fpu)); +} + +hma_fpu_t * +hma_fpu_alloc(int kmflag) +{ + hma_fpu_t *fpu; + + fpu = kmem_zalloc(sizeof (hma_fpu_t), kmflag); + if (fpu == NULL) + return (NULL); + + fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic = + kmem_cache_alloc(fpsave_cachep, kmflag); + if (fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic == NULL) { + kmem_free(fpu, sizeof (hma_fpu_t)); + return (NULL); + } + fpu->hf_inguest = B_FALSE; + + /* + * Make sure the entire structure is zero. + */ + switch (fp_save_mech) { + case FP_FXSAVE: + bzero(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, + sizeof (struct fxsave_state)); + break; + case FP_XSAVE: + bzero(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, + cpuid_get_xsave_size()); + break; + default: + panic("Invalid fp_save_mech"); + } + + return (fpu); +} + +void +hma_fpu_start_guest(hma_fpu_t *fpu) +{ + /* + * Note, we don't check / assert whether or not t_prempt is true because + * there are contexts where this is safe to call (from a context op) + * where t_preempt may not be set. + */ + ASSERT3S(fpu->hf_inguest, ==, B_FALSE); + ASSERT3P(fpu->hf_curthread, ==, NULL); + ASSERT3P(curthread->t_lwp, !=, NULL); + ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_EN, !=, 0); + ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_VALID, !=, 0); + + fpu->hf_inguest = B_TRUE; + fpu->hf_curthread = curthread; + + + fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu); + fp_restore(&fpu->hf_guest_fpu); + fpu->hf_guest_fpu.fpu_flags &= ~FPU_VALID; +} + +/* + * Since fp_save() assumes a thread-centric view of the FPU usage -- it will + * assert if attempting to save elsewhere than the thread PCB, and will elide + * action if the FPU is not enabled -- we cannot use it for the manual saving of + * FPU contents. To work around that, we call the save mechanism directly. + */ +static void +do_fp_save(fpu_ctx_t *fpu) +{ + /* + * For our manual saving, we expect that the thread PCB never be the + * landing zone for the data. + */ + ASSERT(curthread->t_lwp == NULL || + fpu != &curthread->t_lwp->lwp_pcb.pcb_fpu); + + switch (fp_save_mech) { + case FP_FXSAVE: + fpxsave(fpu->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + xsavep(fpu->fpu_regs.kfpu_u.kfpu_xs, fpu->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + } + fpu->fpu_flags |= FPU_VALID; +} + + +void +hma_fpu_stop_guest(hma_fpu_t *fpu) +{ + ASSERT3S(fpu->hf_inguest, ==, B_TRUE); + ASSERT3P(fpu->hf_curthread, ==, curthread); + ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_EN, !=, 0); + ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_VALID, ==, 0); + + do_fp_save(&fpu->hf_guest_fpu); + + fp_restore(&curthread->t_lwp->lwp_pcb.pcb_fpu); + + fpu->hf_inguest = B_FALSE; + fpu->hf_curthread = NULL; +} + +/* + * Will output up to `ndesc` records into `descp`. The required size for an + * XSAVE area containing all of the data fields supported by the host will be + * placed in `req_sizep` (if non-NULL). 
Returns the number of feature bits + * supported by the host. + */ +uint_t +hma_fpu_describe_xsave_state(hma_xsave_state_desc_t *descp, uint_t ndesc, + size_t *req_sizep) +{ + uint64_t features; + + switch (fp_save_mech) { + case FP_FXSAVE: + /* + * Even without xsave support, the FPU will have legacy x87 + * float and SSE state contained within. + */ + features = XFEATURE_LEGACY_FP | XFEATURE_SSE; + break; + case FP_XSAVE: + features = get_xcr(XFEATURE_ENABLED_MASK); + break; + default: + panic("Invalid fp_save_mech"); + } + + uint_t count, pos; + uint_t max_size = MIN_XSAVE_SIZE; + for (count = 0, pos = 0; pos <= 63; pos++) { + const uint64_t bit = (1 << pos); + uint32_t size, off; + + if ((features & bit) == 0) { + continue; + } + + if (bit == XFEATURE_LEGACY_FP || bit == XFEATURE_SSE) { + size = sizeof (struct fxsave_state); + off = 0; + } else { + /* + * Size and position of data types within the XSAVE area + * is described in leaf 0xD in the subfunction + * corresponding to the bit position (for pos > 1). + */ + struct cpuid_regs regs = { + .cp_eax = 0xD, + .cp_ecx = pos, + }; + + ASSERT3U(pos, >, 1); + + (void) __cpuid_insn(®s); + size = regs.cp_eax; + off = regs.cp_ebx; + } + max_size = MAX(max_size, off + size); + + if (count < ndesc) { + hma_xsave_state_desc_t *desc = &descp[count]; + + desc->hxsd_bit = bit; + desc->hxsd_size = size; + desc->hxsd_off = off; + } + count++; + } + if (req_sizep != NULL) { + *req_sizep = max_size; + } + return (count); +} + +hma_fpu_xsave_result_t +hma_fpu_get_xsave_state(const hma_fpu_t *fpu, void *buf, size_t len) +{ + ASSERT(!fpu->hf_inguest); + + size_t valid_len; + switch (fp_save_mech) { + case FP_FXSAVE: { + if (len < MIN_XSAVE_SIZE) { + return (HFXR_NO_SPACE); + } + bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf, + sizeof (struct fxsave_state)); + + struct xsave_header hdr = { + .xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE, + }; + bcopy(&hdr, buf + sizeof (struct fxsave_state), sizeof (hdr)); + + break; + } + case FP_XSAVE: + (void) hma_fpu_describe_xsave_state(NULL, 0, &valid_len); + if (len < valid_len) { + return (HFXR_NO_SPACE); + } + bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf, + valid_len); + break; + default: + panic("Invalid fp_save_mech"); + } + + return (HFXR_OK); +} + +hma_fpu_xsave_result_t +hma_fpu_set_xsave_state(hma_fpu_t *fpu, void *buf, size_t len) +{ + ASSERT(!fpu->hf_inguest); + + if (len < MIN_XSAVE_SIZE) { + return (HFXR_NO_SPACE); + } + /* 64-byte alignment is demanded of the FPU-related operations */ + if (((uintptr_t)buf & 63) != 0) { + return (HFXR_BAD_ALIGN); + } + + struct xsave_header *hdr = buf + sizeof (struct fxsave_state); + if (hdr->xsh_xcomp_bv != 0) { + /* XSAVEC formatting not supported at this time */ + return (HFXR_UNSUP_FMT); + } + + uint64_t allowed_bits; + size_t save_area_size; + switch (fp_save_mech) { + case FP_FXSAVE: + allowed_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE; + save_area_size = sizeof (struct fxsave_state); + break; + case FP_XSAVE: + allowed_bits = get_xcr(XFEATURE_ENABLED_MASK); + save_area_size = cpuid_get_xsave_size(); + break; + default: + panic("Invalid fp_save_mech"); + } + if ((hdr->xsh_xstate_bv & ~(allowed_bits)) != 0) { + return (HFXR_UNSUP_FEAT); + } + + /* + * We validate the incoming state with the FPU itself prior to saving it + * into the guest FPU context area. In order to preserve any state + * currently housed in the FPU, we save it to a temporarily allocated + * FPU context. 
It is important to note that we are not following the + * normal rules around state management detailed in uts/intel/os/fpu.c. + * This saving is unconditional, uncaring about the state in the FPU or + * the value of CR0_TS, simplifying our process before returning to the + * caller (without needing to chcek of an lwp, etc). To prevent + * interrupting threads from encountering this unusual FPU state, we + * keep interrupts disabled for the duration. + */ + fpu_ctx_t temp_ctx = { + .fpu_xsave_mask = XFEATURE_FP_ALL, + }; + temp_ctx.fpu_regs.kfpu_u.kfpu_generic = + kmem_cache_alloc(fpsave_cachep, KM_SLEEP); + bzero(temp_ctx.fpu_regs.kfpu_u.kfpu_generic, save_area_size); + + ulong_t iflag; + iflag = intr_clear(); + bool disable_when_done = (getcr0() & CR0_TS) != 0; + do_fp_save(&temp_ctx); + + /* + * If the provided data is invalid, it will cause a #GP when we attempt + * to load it into the FPU, so protect against that with on_trap(). + * Should the data load successfully, we can then be confident that its + * later use in via hma_fpu_start_guest() will be safe. + */ + on_trap_data_t otd; + volatile hma_fpu_xsave_result_t res = HFXR_OK; + if (on_trap(&otd, OT_DATA_EC) != 0) { + res = HFXR_INVALID_DATA; + goto done; + } + + switch (fp_save_mech) { + case FP_FXSAVE: + if (hdr->xsh_xstate_bv == 0) { + /* + * An empty xstate_bv means we can simply load the + * legacy FP/SSE area with their initial state. + */ + bcopy(&sse_initial, + fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx, + sizeof (sse_initial)); + } else { + fpxrestore(buf); + fpxsave(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx); + } + break; + case FP_XSAVE: + xrestore(buf, XFEATURE_FP_ALL); + xsavep(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs, + fpu->hf_guest_fpu.fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + } + +done: + no_trap(); + fp_restore(&temp_ctx); + if (disable_when_done) { + fpdisable(); + } + intr_restore(iflag); + kmem_cache_free(fpsave_cachep, temp_ctx.fpu_regs.kfpu_u.kfpu_generic); + + return (res); +} + +void +hma_fpu_get_fxsave_state(const hma_fpu_t *fpu, struct fxsave_state *fx) +{ + const struct fxsave_state *guest; + + ASSERT3S(fpu->hf_inguest, ==, B_FALSE); + + guest = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx; + bcopy(guest, fx, sizeof (*fx)); +} + +int +hma_fpu_set_fxsave_state(hma_fpu_t *fpu, const struct fxsave_state *fx) +{ + struct fxsave_state *gfx; + struct xsave_state *gxs; + + ASSERT3S(fpu->hf_inguest, ==, B_FALSE); + + /* + * If reserved bits are set in fx_mxcsr, then we will take a #GP when + * we restore them. Reject this outright. + * + * We do not need to check if we are dealing with state that has pending + * exceptions. This was only the case with the original FPU save and + * restore mechanisms (fsave/frstor). When using fxsave/fxrstor and + * xsave/xrstor they will be deferred to the user using the FPU, which + * is what we'd want here (they'd be used in guest context). 
+ */ + if ((fx->fx_mxcsr & ~sse_mxcsr_mask) != 0) + return (EINVAL); + + switch (fp_save_mech) { + case FP_FXSAVE: + gfx = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx; + bcopy(fx, gfx, sizeof (*fx)); + break; + case FP_XSAVE: + gxs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs; + bzero(gxs, cpuid_get_xsave_size()); + bcopy(fx, &gxs->xs_fxsave, sizeof (*fx)); + gxs->xs_header.xsh_xstate_bv = + XFEATURE_LEGACY_FP | XFEATURE_SSE; + break; + default: + panic("Invalid fp_save_mech"); + } + + return (0); +} diff --git a/usr/src/uts/intel/os/smt.c b/usr/src/uts/intel/os/smt.c new file mode 100644 index 0000000000..7ba9d3025d --- /dev/null +++ b/usr/src/uts/intel/os/smt.c @@ -0,0 +1,770 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * SMT exclusion: prevent a sibling in a hyper-threaded core from running in VMX + * non-root guest mode, when certain threads are running on the other sibling. + * This avoids speculation-based information leaks such as L1TF being available + * to the untrusted guest. The stance we take is that threads from the same + * zone as the guest VPCU thread are considered safe to run alongside, but all + * other threads (except the idle thread), and all interrupts, are unsafe. Note + * that due to the implementation here, there are significant sections of e.g. + * the dispatcher code that can run concurrently with a guest, until the thread + * reaches smt_mark(). This code assumes there are only two SMT threads per + * core. + * + * The entry points are as follows: + * + * smt_mark_as_vcpu() + * + * All threads that enter guest mode (i.e. VCPU threads) need to call this at + * least once, which sets TS_VCPU in ->t_schedflag. + * + * smt_mark() + * + * A new ->cpu_thread is now curthread (although interrupt threads have their + * own separate handling). After preventing any interrupts, we will take our + * own CPU's spinlock and update our own state in mcpu_smt. + * + * If our sibling is poisoned (i.e. in guest mode or the little bit of code + * around it), and we're not compatible (that is, same zone ID, or the idle + * thread), then we need to smt_kick() that sibling. smt_kick() itself waits + * for the sibling to call smt_release(), and it will not re-enter guest mode + * until allowed. + * + * Note that we ignore the fact a process can change its zone ID: poisoning + * threads never do so, and we can ignore the other cases. + * + * smt_acquire() + * + * We are a VCPU thread about to start guest execution. Interrupts are + * disabled. We must have already run smt_mark() to be in this code, so there's + * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED. + * Instead, we take our sibling's lock to also mark ourselves as poisoned in the + * sibling cpu_smt_t. This is so smt_mark() will only ever need to look at its + * local mcpu_smt. + * + * We'll loop here for up to smt_acquire_wait_time microseconds; this is mainly + * to wait out any sibling interrupt: many of them will complete quicker than + * this. 
+ * + * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as + * mitigation against L1TF: no incompatible thread will now be able to populate + * the L1 cache until *we* smt_release(). + * + * smt_release() + * + * Simply unpoison ourselves similarly to smt_acquire(); smt_kick() will wait + * for this to happen if needed. + * + * smt_begin_intr() + * + * In an interrupt prolog. We're either a hilevel interrupt, or a pinning + * interrupt. In both cases, we mark our interrupt depth, and potentially + * smt_kick(). This enforces exclusion, but doesn't otherwise modify + * ->cs_state: we want the dispatcher code to essentially ignore interrupts. + * + * smt_end_intr() + * + * In an interrupt epilogue *or* thread_unpin(). In the first case, we never + * slept, and we can simply decrement our counter. In the second case, we're an + * interrupt thread about to sleep: we'll still just decrement our counter, and + * henceforth treat the thread as a normal thread when it next gets scheduled, + * until it finally gets to its epilogue. + * + * smt_mark_unsafe() / smt_mark_safe() + * + * Mark the current thread as temporarily unsafe (guests should not be executing + * while a sibling is marked unsafe). This can be used for a thread that's + * otherwise considered safe, if it needs to handle potentially sensitive data. + * Right now, this means certain I/O handling operations that reach down into + * the networking and ZFS sub-systems. + * + * smt_should_run(thread, cpu) + * + * This is used by the dispatcher when making scheduling decisions: if the + * sibling is compatible with the given thread, we return B_TRUE. This is + * essentially trying to guess if any subsequent smt_acquire() will fail, by + * peeking at the sibling CPU's state. The peek is racy, but if we get things + * wrong, the "only" consequence is that smt_acquire() may lose. + * + * smt_adjust_cpu_score() + * + * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here, + * we'll add a small penalty to the score. This also makes sure a VCPU thread + * migration behaves properly. + * + * smt_init() / smt_late_init() + * + * Set up SMT handling. If smt_boot_disable is set, smt_late_init(), which runs + * late enough to be able to do so, will offline and mark CPU_DISABLED all the + * siblings. smt_disable() can also be called after boot via psradm -Ha. + */ + +#include <sys/archsystm.h> +#include <sys/disp.h> +#include <sys/cmt.h> +#include <sys/systm.h> +#include <sys/cpu.h> +#include <sys/var.h> +#include <sys/xc_levels.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> +#include <sys/x86_archext.h> +#include <sys/esunddi.h> +#include <sys/promif.h> +#include <sys/policy.h> +#include <sys/smt.h> + +#define CS_SHIFT (8) +#define CS_MASK ((1 << CS_SHIFT) - 1) +#define CS_MARK(s) ((s) & CS_MASK) +#define CS_ZONE(s) ((s) >> CS_SHIFT) +#define CS_MK(s, z) ((s) | (z << CS_SHIFT)) + +typedef enum cs_mark { + CM_IDLE = 0, /* running CPU idle thread */ + CM_THREAD, /* running general non-VCPU thread */ + CM_UNSAFE, /* running ->t_unsafe thread */ + CM_VCPU, /* running VCPU thread */ + CM_POISONED /* running in guest */ +} cs_mark_t; + +/* Double-check our false-sharing padding. 
*/ +CTASSERT(offsetof(cpu_smt_t, cs_sib) == 64); +CTASSERT(CM_IDLE == 0); +CTASSERT(CM_POISONED < (1 << CS_SHIFT)); +CTASSERT(CM_POISONED > CM_VCPU); +CTASSERT(CM_VCPU > CM_UNSAFE); + +static uint_t empty_pil = XC_CPUPOKE_PIL; + +/* + * If disabled, no SMT exclusion is performed, and system is potentially + * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not + * vulnerable" CPUID bit. + */ +int smt_exclusion = 1; + +/* + * How long smt_acquire() will spin trying to acquire the core, in + * micro-seconds. This is enough time to wait out a significant proportion of + * interrupts. + */ +clock_t smt_acquire_wait_time = 64; + +/* + * Did we request a disable of SMT at boot time? + */ +int smt_boot_disable; + +/* + * Whether SMT is enabled. + */ +int smt_enabled = 1; + +/* + * We're adding an interrupt handler of some kind at the given PIL. If this + * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our + * pil_needs_kick() optimization, as there is now potentially an unsafe + * interrupt handler at that PIL. This typically won't occur, so we're not that + * careful about what's actually getting added, which CPU it's on, or if it gets + * removed. This also presumes that softints can't cover our empty_pil. + */ +void +smt_intr_alloc_pil(uint_t pil) +{ + ASSERT(pil <= PIL_MAX); + + if (empty_pil == pil) + empty_pil = PIL_MAX + 1; +} + +/* + * If our sibling is also a VCPU thread from a different zone, we need one of + * them to give up, otherwise they will just battle each other for exclusion + * until they exhaust their quantum. + * + * We arbitrate between them by dispatch priority: clearly, a higher-priority + * thread deserves to win the acquisition. However, under CPU load, it'll be + * very common to see both threads with ->t_pri == 1. If so, we'll break the + * tie by cpu_id (which is hopefully arbitrary enough). + * + * If we lose, the VMM code will take this as a hint to call + * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread + * somewhere else. + * + * Note that all of this state examination is racy, as we don't own any locks + * here. + */ +static boolean_t +yield_to_vcpu(cpu_t *sib, zoneid_t zoneid) +{ + cpu_smt_t *sibsmt = &sib->cpu_m.mcpu_smt; + uint64_t sibstate = sibsmt->cs_state; + + /* + * If we're likely just waiting for an interrupt, don't yield. + */ + if (sibsmt->cs_intr_depth != 0) + return (B_FALSE); + + /* + * We're only interested in VCPUs from a different zone. + */ + if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid) + return (B_FALSE); + + if (curthread->t_pri < sib->cpu_dispatch_pri) + return (B_TRUE); + + if (curthread->t_pri == sib->cpu_dispatch_pri && + CPU->cpu_id < sib->cpu_id) + return (B_TRUE); + + return (B_FALSE); +} + +static inline boolean_t +sibling_compatible(cpu_smt_t *sibsmt, zoneid_t zoneid) +{ + uint64_t sibstate = sibsmt->cs_state; + + if (sibsmt->cs_intr_depth != 0) + return (B_FALSE); + + if (CS_MARK(sibstate) == CM_UNSAFE) + return (B_FALSE); + + if (CS_MARK(sibstate) == CM_IDLE) + return (B_TRUE); + + return (CS_ZONE(sibstate) == zoneid); +} + +int +smt_acquire(void) +{ + clock_t wait = smt_acquire_wait_time; + cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt; + zoneid_t zoneid = getzoneid(); + cpu_smt_t *sibsmt; + int ret = 0; + + ASSERT(!interrupts_enabled()); + + if (smt->cs_sib == NULL) { + /* For the "sequential" L1TF case. */ + spec_uarch_flush(); + return (1); + } + + sibsmt = &smt->cs_sib->cpu_m.mcpu_smt; + + /* A VCPU thread should never change zone. 
*/ + ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid); + ASSERT3U(CS_MARK(smt->cs_state), ==, CM_VCPU); + ASSERT3U(curthread->t_preempt, >=, 1); + ASSERT(curthread->t_schedflag & TS_VCPU); + + while (ret == 0 && wait > 0) { + + if (yield_to_vcpu(smt->cs_sib, zoneid)) { + ret = -1; + break; + } + + if (sibling_compatible(sibsmt, zoneid)) { + lock_set(&sibsmt->cs_lock); + + if (sibling_compatible(sibsmt, zoneid)) { + smt->cs_state = CS_MK(CM_POISONED, zoneid); + sibsmt->cs_sibstate = CS_MK(CM_POISONED, + zoneid); + membar_enter(); + ret = 1; + } + + lock_clear(&sibsmt->cs_lock); + } else { + drv_usecwait(10); + wait -= 10; + } + } + + DTRACE_PROBE4(smt__acquire, int, ret, uint64_t, sibsmt->cs_state, + uint64_t, sibsmt->cs_intr_depth, clock_t, wait); + + if (ret == 1) + spec_uarch_flush(); + + return (ret); +} + +void +smt_release(void) +{ + cpu_smt_t *smt = &CPU->cpu_m.mcpu_smt; + zoneid_t zoneid = getzoneid(); + cpu_smt_t *sibsmt; + + ASSERT(!interrupts_enabled()); + + if (smt->cs_sib == NULL) + return; + + ASSERT3U(CS_ZONE(smt->cs_state), ==, zoneid); + ASSERT3U(CS_MARK(smt->cs_state), ==, CM_POISONED); + ASSERT3U(curthread->t_preempt, >=, 1); + + sibsmt = &smt->cs_sib->cpu_m.mcpu_smt; + + lock_set(&sibsmt->cs_lock); + + smt->cs_state = CS_MK(CM_VCPU, zoneid); + sibsmt->cs_sibstate = CS_MK(CM_VCPU, zoneid); + membar_producer(); + + lock_clear(&sibsmt->cs_lock); +} + +static void +smt_kick(cpu_smt_t *smt, zoneid_t zoneid) +{ + uint64_t sibstate; + + ASSERT(LOCK_HELD(&smt->cs_lock)); + ASSERT(!interrupts_enabled()); + + poke_cpu(smt->cs_sib->cpu_id); + + membar_consumer(); + sibstate = smt->cs_sibstate; + + if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid) + return; + + lock_clear(&smt->cs_lock); + + /* + * Spin until we can see the sibling has been kicked out or is otherwise + * OK. 
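+	 *
+	 * Note that our own cs_lock was dropped above precisely because the
+	 * sibling clears the poisoned state in our cs_sibstate from
+	 * smt_release() while holding that same lock; spinning with it held
+	 * would block the transition we are waiting for.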
+ */ + for (;;) { + membar_consumer(); + sibstate = smt->cs_sibstate; + + if (CS_MARK(sibstate) != CM_POISONED || + CS_ZONE(sibstate) == zoneid) + break; + + SMT_PAUSE(); + } + + lock_set(&smt->cs_lock); +} + +static boolean_t +pil_needs_kick(uint_t pil) +{ + return (pil != empty_pil); +} + +void +smt_begin_intr(uint_t pil) +{ + ulong_t flags; + cpu_smt_t *smt; + + ASSERT(pil <= PIL_MAX); + + flags = intr_clear(); + smt = &CPU->cpu_m.mcpu_smt; + + if (smt->cs_sib == NULL) { + intr_restore(flags); + return; + } + + if (atomic_inc_64_nv(&smt->cs_intr_depth) == 1 && pil_needs_kick(pil)) { + lock_set(&smt->cs_lock); + + membar_consumer(); + + if (CS_MARK(smt->cs_sibstate) == CM_POISONED) + smt_kick(smt, GLOBAL_ZONEID); + + lock_clear(&smt->cs_lock); + } + + intr_restore(flags); +} + +void +smt_end_intr(void) +{ + ulong_t flags; + cpu_smt_t *smt; + + flags = intr_clear(); + smt = &CPU->cpu_m.mcpu_smt; + + if (smt->cs_sib == NULL) { + intr_restore(flags); + return; + } + + ASSERT3U(smt->cs_intr_depth, >, 0); + atomic_dec_64(&smt->cs_intr_depth); + + intr_restore(flags); +} + +static inline boolean_t +smt_need_kick(cpu_smt_t *smt, zoneid_t zoneid) +{ + membar_consumer(); + + if (CS_MARK(smt->cs_sibstate) != CM_POISONED) + return (B_FALSE); + + if (CS_MARK(smt->cs_state) == CM_UNSAFE) + return (B_TRUE); + + return (CS_ZONE(smt->cs_sibstate) != zoneid); +} + +void +smt_mark(void) +{ + zoneid_t zoneid = getzoneid(); + kthread_t *t = curthread; + ulong_t flags; + cpu_smt_t *smt; + cpu_t *cp; + + flags = intr_clear(); + + cp = CPU; + smt = &cp->cpu_m.mcpu_smt; + + if (smt->cs_sib == NULL) { + intr_restore(flags); + return; + } + + lock_set(&smt->cs_lock); + + /* + * If we were a nested interrupt and went through the resume_from_intr() + * path, we can now be resuming to a pinning interrupt thread; in which + * case, skip marking, until we later resume to a "real" thread. 
+ */ + if (smt->cs_intr_depth > 0) { + ASSERT3P(t->t_intr, !=, NULL); + + if (smt_need_kick(smt, zoneid)) + smt_kick(smt, zoneid); + goto out; + } + + if (t == t->t_cpu->cpu_idle_thread) { + ASSERT3U(zoneid, ==, GLOBAL_ZONEID); + smt->cs_state = CS_MK(CM_IDLE, zoneid); + } else { + uint64_t state = CM_THREAD; + + if (t->t_unsafe) + state = CM_UNSAFE; + else if (t->t_schedflag & TS_VCPU) + state = CM_VCPU; + + smt->cs_state = CS_MK(state, zoneid); + + if (smt_need_kick(smt, zoneid)) + smt_kick(smt, zoneid); + } + +out: + membar_producer(); + lock_clear(&smt->cs_lock); + intr_restore(flags); +} + +void +smt_begin_unsafe(void) +{ + curthread->t_unsafe++; + smt_mark(); +} + +void +smt_end_unsafe(void) +{ + ASSERT3U(curthread->t_unsafe, >, 0); + curthread->t_unsafe--; + smt_mark(); +} + +void +smt_mark_as_vcpu(void) +{ + thread_lock(curthread); + curthread->t_schedflag |= TS_VCPU; + smt_mark(); + thread_unlock(curthread); +} + +boolean_t +smt_should_run(kthread_t *t, cpu_t *cp) +{ + uint64_t sibstate; + cpu_t *sib; + + if (t == t->t_cpu->cpu_idle_thread) + return (B_TRUE); + + if ((sib = cp->cpu_m.mcpu_smt.cs_sib) == NULL) + return (B_TRUE); + + sibstate = sib->cpu_m.mcpu_smt.cs_state; + + if ((t->t_schedflag & TS_VCPU)) { + if (CS_MARK(sibstate) == CM_IDLE) + return (B_TRUE); + if (CS_MARK(sibstate) == CM_UNSAFE) + return (B_FALSE); + return (CS_ZONE(sibstate) == ttozone(t)->zone_id); + } + + if (CS_MARK(sibstate) < CM_VCPU) + return (B_TRUE); + + return (CS_ZONE(sibstate) == ttozone(t)->zone_id); +} + +pri_t +smt_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score) +{ + if (smt_should_run(t, cp)) + return (score); + + /* + * If we're a VCPU thread scoring our current CPU, we are most likely + * asking to be rescheduled elsewhere after losing smt_acquire(). In + * this case, the current CPU is not a good choice, most likely, and we + * should go elsewhere. + */ + if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0) + return ((v.v_maxsyspri + 1) * 2); + + return (score + 1); +} + +static void +set_smt_prop(void) +{ + (void) e_ddi_prop_update_string(DDI_DEV_T_NONE, ddi_root_node(), + "smt_enabled", smt_enabled ? "true" : "false"); +} + +static cpu_t * +smt_find_sibling(cpu_t *cp) +{ + for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) { + pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i); + group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus; + + if (pg->cmt_pg.pghw_hw != PGHW_IPIPE) + continue; + + if (GROUP_SIZE(cg) == 1) + break; + + if (GROUP_SIZE(cg) != 2) { + panic("%u SMT threads unsupported", GROUP_SIZE(cg)); + } + + if (GROUP_ACCESS(cg, 0) != cp) + return (GROUP_ACCESS(cg, 0)); + + VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp); + + return (GROUP_ACCESS(cg, 1)); + } + + return (NULL); +} + +/* + * Offline all siblings and mark as CPU_DISABLED. Note that any siblings that + * can't be offlined (if it would leave an empty partition, or it's a spare, or + * whatever) will fail the whole operation. + */ +int +smt_disable(void) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (secpolicy_ponline(CRED()) != 0) + return (EPERM); + + if (!smt_enabled) + return (0); + + for (size_t i = 0; i < NCPU; i++) { + cpu_t *sib; + cpu_t *cp; + + if ((cp = cpu_get(i)) == NULL) + continue; + + /* NB: we don't necessarily have .mcpu_smt to use here. 
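+		 * smt_init() skips setting up the sibling links entirely when
+		 * SMT exclusion is disabled (smt_exclusion == 0) or a
+		 * boot-time disable was requested, so look the sibling up
+		 * from the PG data instead.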
*/ + if ((sib = smt_find_sibling(cp)) == NULL) + continue; + + if (cp->cpu_id < sib->cpu_id) + continue; + + if (cp->cpu_flags & CPU_DISABLED) { + VERIFY(cp->cpu_flags & CPU_OFFLINE); + continue; + } + + if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) { + error = EINVAL; + break; + } + + if ((cp->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY) { + cp->cpu_flags |= CPU_DISABLED; + continue; + } + + if ((error = cpu_offline(cp, CPU_FORCED)) != 0) + break; + + cp->cpu_flags |= CPU_DISABLED; + cpu_set_state(cp); + } + + if (error != 0) + return (error); + + smt_enabled = 0; + set_smt_prop(); + cmn_err(CE_NOTE, "!SMT / hyper-threading explicitly disabled."); + return (0); +} + +boolean_t +smt_can_enable(cpu_t *cp, int flags) +{ + VERIFY(cp->cpu_flags & CPU_DISABLED); + + return (!smt_boot_disable && (flags & CPU_FORCED)); +} + +/* + * If we force-onlined a CPU_DISABLED CPU, then we can no longer consider the + * system to be SMT-disabled in toto. + */ +void +smt_force_enabled(void) +{ + VERIFY(!smt_boot_disable); + + if (!smt_enabled) + cmn_err(CE_NOTE, "!Disabled SMT sibling forced on-line."); + + smt_enabled = 1; + set_smt_prop(); +} + +/* + * Initialize SMT links. We have to be careful here not to race with + * smt_begin/end_intr(), which also complicates trying to do this initialization + * from a cross-call; hence the slightly odd approach below. + * + * If we're going to disable SMT via smt_late_init(), we will avoid paying the + * price here at all (we can't do it here since we're still too early in + * main()). + */ +void +smt_init(void) +{ + boolean_t found_sibling = B_FALSE; + cpu_t *scp = CPU; + cpu_t *cp = scp; + ulong_t flags; + + if (!smt_exclusion || smt_boot_disable) + return; + + mutex_enter(&cpu_lock); + + do { + thread_affinity_set(curthread, cp->cpu_id); + flags = intr_clear(); + + cp->cpu_m.mcpu_smt.cs_intr_depth = 0; + cp->cpu_m.mcpu_smt.cs_state = CS_MK(CM_THREAD, GLOBAL_ZONEID); + cp->cpu_m.mcpu_smt.cs_sibstate = CS_MK(CM_THREAD, + GLOBAL_ZONEID); + ASSERT3P(cp->cpu_m.mcpu_smt.cs_sib, ==, NULL); + cp->cpu_m.mcpu_smt.cs_sib = smt_find_sibling(cp); + + if (cp->cpu_m.mcpu_smt.cs_sib != NULL) + found_sibling = B_TRUE; + + intr_restore(flags); + thread_affinity_clear(curthread); + } while ((cp = cp->cpu_next_onln) != scp); + + mutex_exit(&cpu_lock); + + if (!found_sibling) + smt_enabled = 0; +} + +void +smt_late_init(void) +{ + if (smt_boot_disable) { + int err; + + mutex_enter(&cpu_lock); + + err = smt_disable(); + + /* + * We're early enough in boot that nothing should have stopped + * us from offlining the siblings. As we didn't prepare our + * L1TF mitigation in this case, we need to panic. + */ + if (err) { + cmn_err(CE_PANIC, "smt_disable() failed with %d", err); + } + + mutex_exit(&cpu_lock); + } + + if (smt_enabled) + cmn_err(CE_NOTE, "!SMT enabled\n"); + + set_smt_prop(); +} diff --git a/usr/src/uts/intel/ppt/Makefile b/usr/src/uts/intel/ppt/Makefile new file mode 100644 index 0000000000..fe9e0026c1 --- /dev/null +++ b/usr/src/uts/intel/ppt/Makefile @@ -0,0 +1,63 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. 
+# Copyright 2022 Oxide Computer Company +# + +UTSBASE = ../.. + +MODULE = ppt +OBJECTS = $(PPT_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/vmm/io +MAPFILE = $(UTSBASE)/intel/io/vmm/io/ppt.mapfile + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) + +PRE_INC_PATH = \ + -I$(COMPAT)/bhyve \ + -I$(COMPAT)/bhyve/amd64 \ + -I$(CONTRIB)/bhyve \ + -I$(CONTRIB)/bhyve/amd64 + +INC_PATH += -I$(UTSBASE)/intel/io/vmm -I$(UTSBASE)/intel/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/intel/io/vmm -I$(OBJS_DIR) + +LDFLAGS += -N drv/vmm -N misc/pcie +LDFLAGS += -M $(MAPFILE) + +$(OBJS_DIR)/ppt.o := CERRWARN += -_gcc=-Wno-unused-variable + +# needs work +SMOFF += all_func_returns + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ +include $(UTSBASE)/intel/io/vmm/Makefile.rules diff --git a/usr/src/uts/intel/sys/Makefile b/usr/src/uts/intel/sys/Makefile index 5cfbdec4fc..0a6af2de2d 100644 --- a/usr/src/uts/intel/sys/Makefile +++ b/usr/src/uts/intel/sys/Makefile @@ -91,6 +91,12 @@ HDRS = \ x86_archext.h \ xen_errno.h + +# Headers shared with the various machine architectures are installed via +# different means, but are checked here, since it is a common point. +include Makefile.psm +CHECK_ONLY_HDRS = $(PSM_SHARED_HDRS) + ROOTDIR= $(ROOT)/usr/include/sys SCSIDIR= $(ROOTDIR)/scsi SCSIDIRS= $(SCSIDIR) $(SCSIDIR)/conf $(SCSIDIR)/generic \ @@ -100,8 +106,9 @@ ROOTDIRS= $(ROOTDIR) $(ROOTFSDIR) ROOTHDRS= $(HDRS:%=$(ROOTDIR)/%) -CHECKHDRS= \ - $(HDRS:%.h=%.check) +CHECKHDRS = \ + $(HDRS:%.h=%.check) \ + $(CHECK_ONLY_HDRS:%.h=%.check) \ # install rules $(ROOTDIR)/%: % diff --git a/usr/src/uts/intel/sys/Makefile.psm b/usr/src/uts/intel/sys/Makefile.psm new file mode 100644 index 0000000000..8fecc14f49 --- /dev/null +++ b/usr/src/uts/intel/sys/Makefile.psm @@ -0,0 +1,27 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2022 Oxide Computer Company +# + +# HMA and SMT-exclusion headers are used by and exposed (via system-header) by +# the various machine architectures. + +PSM_SHARED_HDRS = \ + hma.h \ + smt.h \ + smt_machcpu.h + +PSM_SHARED_HDR_DIR = $(UTSBASE)/intel/sys + +$(USR_PSM_ISYS_DIR)/%: $(PSM_SHARED_HDR_DIR)/% $(USR_PSM_ISYS_DIR) + $(INS.file) diff --git a/usr/src/uts/intel/sys/hma.h b/usr/src/uts/intel/sys/hma.h new file mode 100644 index 0000000000..e15cd60d5e --- /dev/null +++ b/usr/src/uts/intel/sys/hma.h @@ -0,0 +1,178 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2022 Oxide Computer Company + */ + +#ifndef _SYS_HMA_H +#define _SYS_HMA_H + +/* + * Hypervisor Multiplexor API + * + * This provides a set of APIs that are usable by hypervisor implementations + * that allows them to coexist and to make sure that they are all in a + * consistent state. + */ + +#include <sys/fp.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Register a hypervisor with HMA. On success, a pointer to the opaque + * registration token will be returned, indicating that proper host setup has + * occurred for further hypervisor actions. + */ +typedef struct hma_reg hma_reg_t; +extern hma_reg_t *hma_register(const char *); +extern hma_reg_t *hma_register_exclusive(const char *); +extern void hma_unregister(hma_reg_t *); + +/* + * Allocate or free a VPID for use with VMX. + * + * This must not be performed by a hypervisor until it has successfully + * registered via hma_register(). + */ +extern uint16_t hma_vmx_vpid_alloc(void); +extern void hma_vmx_vpid_free(uint16_t); + +/* + * On all active CPUs, perform a single-context INVEPT on the given EPTP. + */ +extern void hma_vmx_invept_allcpus(uintptr_t); + +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + +/* + * FPU related management. These functions provide a set of APIs to manage the + * FPU state and switch between host and guest management of this state. + */ + +typedef struct hma_fpu hma_fpu_t; + +/* + * Allocate and free FPU state management structures. + */ +extern hma_fpu_t *hma_fpu_alloc(int); +extern void hma_fpu_free(hma_fpu_t *); + +/* + * Resets the FPU to the standard x86 default state. This should be called after + * allocation and whenever the guest needs to logically reset the state (when + * the CPU is reset, etc.). If the system supports xsave, then the xbv state + * will be set to have the x87 and SSE portions as valid and the rest will be + * set to their initial states (regardless of whether or not they will be + * advertised in the host). + */ +extern int hma_fpu_init(hma_fpu_t *); + +/* + * Save the current host's FPU state and restore the guest's state in the FPU. + * At this point, CR0.TS will not be set. The caller must not use the FPU in any + * way before entering the guest. + * + * This should be used in normal operation before entering the guest. It should + * also be used in a thread context operation when the thread is being scheduled + * again. This interface has an implicit assumption that a given guest state + * will be mapped to only one specific OS thread at any given time. + * + * This must be called with preemption disabled. + */ +extern void hma_fpu_start_guest(hma_fpu_t *); + +/* + * Save the current guest's FPU state and restore the host's state in the FPU. + * By the time the thread returns to userland, the FPU will be in a usable + * state; however, the FPU will not be usable while inside the kernel (CR0.TS + * will be set). + * + * This should be used in normal operation after leaving the guest and returning + * to user land. It should also be used in a thread context operation when the + * thread is being descheduled. Like the hma_fpu_start_guest() interface, this + * interface has an implicit assumption that a given guest state will be mapped + * to only a single OS thread at any given time. + * + * This must be called with preemption disabled. 
+ */ +extern void hma_fpu_stop_guest(hma_fpu_t *); + +typedef enum { + HFXR_OK = 0, + HFXR_NO_SPACE, /* buffer is not large enough */ + HFXR_BAD_ALIGN, /* buffer is not properly (64-byte) aligned */ + HFXR_UNSUP_FMT, /* data using unsupported (compressed) format */ + HFXR_UNSUP_FEAT, /* data has unsupported features set */ + HFXR_INVALID_DATA, /* CPU determined xsave data is invalid */ +} hma_fpu_xsave_result_t; + +/* + * Get and set the contents of the FPU save area, formatted as XSAVE-style + * information. If XSAVE is not supported by the host, the input and output + * values will be translated to and from the FXSAVE format. Attempts to set + * XSAVE values not supported by the host will result in an error. + * + * These functions cannot be called while the FPU is in use by the guest. It is + * up to callers to guarantee this invariant. + */ +extern hma_fpu_xsave_result_t hma_fpu_get_xsave_state(const hma_fpu_t *, void *, + size_t); +extern hma_fpu_xsave_result_t hma_fpu_set_xsave_state(hma_fpu_t *, void *, + size_t); + +typedef struct hma_xsave_state_desc { + uint64_t hxsd_bit; + uint32_t hxsd_size; + uint32_t hxsd_off; +} hma_xsave_state_desc_t; + +/* + * Get a description of the data fields supported by the host via the XSAVE APIs + * for getting/setting guest FPU data. See the function definition for more + * detailed parameter usage. + */ +extern uint_t hma_fpu_describe_xsave_state(hma_xsave_state_desc_t *, uint_t, + size_t *); + +/* + * Get and set the contents of the FPU save area. This sets the fxsave style + * information. In all cases when this is in use, if an XSAVE state is actually + * used by the host, then this will end up zeroing all of the non-fxsave state + * and it will reset the xbv to indicate that the legacy x87 and SSE portions + * are valid. + * + * These functions cannot be called while the FPU is in use by the guest. It is + * up to callers to guarantee this fact. + */ +extern void hma_fpu_get_fxsave_state(const hma_fpu_t *, struct fxsave_state *); +extern int hma_fpu_set_fxsave_state(hma_fpu_t *, const struct fxsave_state *); + +/* Perform HMA initialization steps during boot-up. */ +extern void hma_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_HMA_H */ diff --git a/usr/src/uts/intel/sys/ppt_dev.h b/usr/src/uts/intel/sys/ppt_dev.h new file mode 100644 index 0000000000..a7b65ad0dd --- /dev/null +++ b/usr/src/uts/intel/sys/ppt_dev.h @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2018 Joyent, Inc + */ + +#ifndef _PPT_DEV_H +#define _PPT_DEV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define PPT_IOC (('P' << 16)|('T' << 8)) + +#define PPT_CFG_READ (PPT_IOC | 0x01) +#define PPT_CFG_WRITE (PPT_IOC | 0x02) +#define PPT_BAR_QUERY (PPT_IOC | 0x03) +#define PPT_BAR_READ (PPT_IOC | 0x04) +#define PPT_BAR_WRITE (PPT_IOC | 0x05) + +#define PPT_MAXNAMELEN 32 + +struct ppt_cfg_io { + uint64_t pci_off; + uint32_t pci_width; + uint32_t pci_data; +}; +struct ppt_bar_io { + uint32_t pbi_bar; + uint32_t pbi_off; + uint32_t pbi_width; + uint32_t pbi_data; +}; + +struct ppt_bar_query { + uint32_t pbq_baridx; + uint32_t pbq_type; + uint64_t pbq_base; + uint64_t pbq_size; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _PPT_DEV_H */ diff --git a/usr/src/uts/intel/sys/smt.h b/usr/src/uts/intel/sys/smt.h new file mode 100644 index 0000000000..f539d13799 --- /dev/null +++ b/usr/src/uts/intel/sys/smt.h @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS_SMT_H +#define _SYS_SMT_H + +#include <sys/types.h> +#include <sys/thread.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct cpu; + +extern int smt_boot_disable; + +extern void smt_init(void); +extern void smt_late_init(void); +extern int smt_disable(void); +extern boolean_t smt_can_enable(struct cpu *, int); +extern void smt_force_enabled(void); + +extern void smt_intr_alloc_pil(uint_t); + +extern int smt_acquire(void); +extern void smt_release(void); +extern void smt_mark(void); +extern void smt_begin_unsafe(void); +extern void smt_end_unsafe(void); +extern void smt_begin_intr(uint_t); +extern void smt_end_intr(void); +extern void smt_mark_as_vcpu(void); + +extern boolean_t smt_should_run(kthread_t *, struct cpu *); +extern pri_t smt_adjust_cpu_score(kthread_t *, struct cpu *, pri_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SMT_H */ diff --git a/usr/src/uts/intel/sys/smt_machcpu.h b/usr/src/uts/intel/sys/smt_machcpu.h new file mode 100644 index 0000000000..a8fcd8621b --- /dev/null +++ b/usr/src/uts/intel/sys/smt_machcpu.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + */ + +#ifndef _SYS_SMT_MACHCPU_H +#define _SYS_SMT_MACHCPU_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The SMT exclusion logic requires `struct cpu_smt` be present in + * `struct machcpu` as the field `mcpu_smt`. It is defined here, on its own, so + * it may be easily included by the relevant machine architecture(s). 
+ */ +typedef struct cpu_smt { + lock_t cs_lock; + char cs_pad[56]; + struct cpu *cs_sib; + volatile uint64_t cs_intr_depth; + volatile uint64_t cs_state; + volatile uint64_t cs_sibstate; +} cpu_smt_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SMT_MACHCPU_H */ diff --git a/usr/src/uts/intel/sys/viona_io.h b/usr/src/uts/intel/sys/viona_io.h new file mode 100644 index 0000000000..46cc72eb06 --- /dev/null +++ b/usr/src/uts/intel/sys/viona_io.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VIONA_IO_H_ +#define _VIONA_IO_H_ + +#define VNA_IOC (('V' << 16)|('C' << 8)) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) + +typedef struct vioc_create { + datalink_id_t c_linkid; + int c_vmfd; +} vioc_create_t; + +typedef struct vioc_ring_init { + uint16_t ri_index; + uint16_t ri_qsize; + uint64_t ri_qaddr; +} vioc_ring_init_t; + +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + +#endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/intel/sys/vmm.h b/usr/src/uts/intel/sys/vmm.h new file mode 100644 index 0000000000..e58d63761e --- /dev/null +++ b/usr/src/uts/intel/sys/vmm.h @@ -0,0 +1,392 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, + VM_REG_GUEST_ENTRY_INST_LENGTH, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +/* + * illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does. + * To simplify structure definitions, an arbitrary limit has been chosen. 
+ * This same limit is used for memory segment names + */ + +#define VM_MAX_NAMELEN 128 +#define VM_MAX_SEG_NAMELEN 128 + +#define VM_MAXCPU 32 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_ENABLE_INVPCID, + VM_CAP_BPT_EXIT, + VM_CAP_MAX +}; + +enum vmx_caps { + VMX_CAP_NONE = 0, + VMX_CAP_TPR_SHADOW = (1UL << 0), + VMX_CAP_APICV = (1UL << 1), + VMX_CAP_APICV_X2APIC = (1UL << 2), + VMX_CAP_APICV_PIR = (1UL << 3), +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_RUN_STATE, + VM_EXITCODE_MMIO_EMUL, + VM_EXITCODE_DEPRECATED, /* formerly RUNBLOCK */ + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_MMIO, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, + VM_EXITCODE_BPT, + VM_EXITCODE_HT, + VM_EXITCODE_MAX +}; + +enum inout_flags { + INOUT_IN = (1U << 0), /* direction: 'in' when set, else 'out' */ + + /* + * The following flags are used only for in-kernel emulation logic and + * are not exposed to userspace. + */ + INOUT_STR = (1U << 1), /* ins/outs operation */ + INOUT_REP = (1U << 2), /* 'rep' prefix present on instruction */ +}; + +struct vm_inout { + uint32_t eax; + uint16_t port; + uint8_t bytes; /* 1 or 2 or 4 */ + uint8_t flags; /* see: inout_flags */ + + /* + * The address size and segment are relevant to INS/OUTS operations. + * Userspace is not concerned with them since the in-kernel emulation + * handles those specific aspects. 
+ */ + uint8_t addrsize; + uint8_t segment; +}; + +struct vm_mmio { + uint8_t bytes; /* 1/2/4/8 bytes */ + uint8_t read; /* read: 1, write: 0 */ + uint16_t _pad[3]; + uint64_t gpa; + uint64_t data; +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + +enum vcpu_run_state { + VRS_HALT = 0, + VRS_INIT = (1 << 0), + VRS_RUN = (1 << 1), + + VRS_PEND_INIT = (1 << 14), + VRS_PEND_SIPI = (1 << 15), +}; +#define VRS_MASK_VALID(v) \ + ((v) & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI | VRS_PEND_SIPI)) +#define VRS_IS_VALID(v) ((v) == VRS_MASK_VALID(v)) + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_mmio mmio; + struct { + uint64_t gpa; + int fault_type; + } paging; + /* + * Kernel-internal MMIO decoding and emulation. + * Userspace should not expect to see this, but rather a + * VM_EXITCODE_MMIO with the above 'mmio' context. + */ + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + } mmio_emul; + struct { + uint8_t inst[15]; + uint8_t num_valid; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { + int inst_length; + } bpt; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + uint64_t rflags; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +enum vm_entry_cmds { + VEC_DEFAULT = 0, + VEC_DISCARD_INSTR, /* discard inst emul state */ + VEC_FULFILL_MMIO, /* entry includes result for mmio emul */ + VEC_FULFILL_INOUT, /* entry includes result for inout emul */ +}; + +struct vm_entry { + int cpuid; + uint_t cmd; /* see: vm_entry_cmds */ + void *exit_data; + union { + struct vm_inout inout; + struct vm_mmio mmio; + } u; +}; + +int vm_restart_instruction(void *vm, int vcpuid); + +enum vm_create_flags { + /* + * Allocate guest memory segments from existing reservoir capacity, + * rather than attempting to create transient allocations. + */ + VCF_RESERVOIR_MEM = (1 << 0), +}; + +#endif /* _VMM_H_ */ diff --git a/usr/src/uts/intel/sys/vmm_dev.h b/usr/src/uts/intel/sys/vmm_dev.h new file mode 100644 index 0000000000..027a7da214 --- /dev/null +++ b/usr/src/uts/intel/sys/vmm_dev.h @@ -0,0 +1,458 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#include <machine/vmm.h> + +#include <sys/param.h> +#include <sys/cpuset.h> + +struct vm_create_req { + char name[VM_MAX_NAMELEN]; + uint64_t flags; +}; + + +struct vm_destroy_req { + char name[VM_MAX_NAMELEN]; +}; + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 + +struct vm_munmap { + vm_paddr_t gpa; + size_t len; +}; + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? 
(m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[VM_MAX_SEG_NAMELEN]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int pptfd; +}; + +struct vm_pptdev_mmio { + int pptfd; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int pptfd; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int pptfd; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int pptfd; + int msi_limit; + int msix_limit; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS (64 + VM_MAXCPU) + +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +#define VM_REINIT_F_FORCE_SUSPEND (1 << 0) + +struct vm_reinit { + uint64_t flags; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; +#ifndef _KERNEL + cpuset_t *cpus; +#else + void *cpus; +#endif +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +struct vm_devmem_offset { + int segid; + off_t offset; +}; + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +struct vm_readwrite_kernemu_device { + int vcpuid; + unsigned access_width : 3; + unsigned _unused : 29; + uint64_t gpa; + uint64_t value; +}; +_Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); + +enum vcpu_reset_kind { + VRK_RESET = 0, + /* + * The reset performed by an INIT IPI clears much of the CPU state, but + * some portions are left untouched, unlike VRK_RESET, which represents + * a "full" reset as if the system was freshly powered on. 
+ */ + VRK_INIT = 1, +}; + +struct vm_vcpu_reset { + int vcpuid; + uint32_t kind; /* contains: enum vcpu_reset_kind */ +}; + +struct vm_run_state { + int vcpuid; + uint32_t state; /* of enum cpu_init_status type */ + uint8_t sipi_vector; /* vector of SIPI, if any */ + uint8_t _pad[3]; +}; + +/* Transfer data for VM_GET_FPU and VM_SET_FPU */ +struct vm_fpu_state { + int vcpuid; + void *buf; + size_t len; +}; + +struct vm_fpu_desc_entry { + uint64_t vfde_feature; + uint32_t vfde_size; + uint32_t vfde_off; +}; + +struct vm_fpu_desc { + struct vm_fpu_desc_entry *vfd_entry_data; + size_t vfd_req_size; + uint32_t vfd_num_entries; +}; + +struct vmm_resv_query { + size_t vrq_free_sz; + size_t vrq_alloc_sz; + size_t vrq_alloc_transient_sz; + size_t vrq_limit; +}; + +/* + * struct vmm_dirty_tracker is used for tracking dirty guest pages during + * e.g. live migration. + * + * - The `vdt_start_gpa` field specifies the offset from the beginning of + * guest physical memory to track; + * - `vdt_pfns` points to a bit vector indexed by guest PFN relative to the + * given start address. Each bit indicates whether the given guest page + * is dirty or not. + * - `vdt_len` specifies the length of the guest physical memory + * region in bytes. It also de facto bounds the range of guest addresses + * we will examine on any one `VM_TRACK_DIRTY_PAGES` ioctl(). If the + * range of the bit vector spans an unallocated region (or extends beyond + * the end of the guest physical address space) the corresponding bits in + * `vdt_pfns` will be zeroed. + */ +struct vmm_dirty_tracker { + uint64_t vdt_start_gpa; + size_t vdt_len; /* length of region */ + void *vdt_pfns; /* bit vector of dirty bits */ +}; + +#define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) +#define VMM_IOC_BASE (('v' << 16) | ('m' << 8)) +#define VMM_LOCK_IOC_BASE (('v' << 16) | ('l' << 8)) +#define VMM_CPU_IOC_BASE (('v' << 16) | ('p' << 8)) + +/* Operations performed on the vmmctl device */ +#define VMM_CREATE_VM (VMMCTL_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMMCTL_IOC_BASE | 0x02) +#define VMM_VM_SUPPORTED (VMMCTL_IOC_BASE | 0x03) + +#define VMM_RESV_QUERY (VMMCTL_IOC_BASE | 0x10) +#define VMM_RESV_ADD (VMMCTL_IOC_BASE | 0x11) +#define VMM_RESV_REMOVE (VMMCTL_IOC_BASE | 0x12) + +/* Operations performed in the context of a given vCPU */ +#define VM_RUN (VMM_CPU_IOC_BASE | 0x01) +#define VM_SET_REGISTER (VMM_CPU_IOC_BASE | 0x02) +#define VM_GET_REGISTER (VMM_CPU_IOC_BASE | 0x03) +#define VM_SET_SEGMENT_DESCRIPTOR (VMM_CPU_IOC_BASE | 0x04) +#define VM_GET_SEGMENT_DESCRIPTOR (VMM_CPU_IOC_BASE | 0x05) +#define VM_SET_REGISTER_SET (VMM_CPU_IOC_BASE | 0x06) +#define VM_GET_REGISTER_SET (VMM_CPU_IOC_BASE | 0x07) +#define VM_INJECT_EXCEPTION (VMM_CPU_IOC_BASE | 0x08) +#define VM_SET_CAPABILITY (VMM_CPU_IOC_BASE | 0x09) +#define VM_GET_CAPABILITY (VMM_CPU_IOC_BASE | 0x0a) +#define VM_PPTDEV_MSI (VMM_CPU_IOC_BASE | 0x0b) +#define VM_PPTDEV_MSIX (VMM_CPU_IOC_BASE | 0x0c) +#define VM_SET_X2APIC_STATE (VMM_CPU_IOC_BASE | 0x0d) +#define VM_GLA2GPA (VMM_CPU_IOC_BASE | 0x0e) +#define VM_GLA2GPA_NOFAULT (VMM_CPU_IOC_BASE | 0x0f) +#define VM_ACTIVATE_CPU (VMM_CPU_IOC_BASE | 0x10) +#define VM_SET_INTINFO (VMM_CPU_IOC_BASE | 0x11) +#define VM_GET_INTINFO (VMM_CPU_IOC_BASE | 0x12) +#define VM_RESTART_INSTRUCTION (VMM_CPU_IOC_BASE | 0x13) +#define VM_SET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x14) +#define VM_GET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x15) +#define VM_RESET_CPU (VMM_CPU_IOC_BASE | 0x16) +#define VM_GET_RUN_STATE (VMM_CPU_IOC_BASE | 0x17) +#define 
VM_SET_RUN_STATE (VMM_CPU_IOC_BASE | 0x18) +#define VM_GET_FPU (VMM_CPU_IOC_BASE | 0x19) +#define VM_SET_FPU (VMM_CPU_IOC_BASE | 0x1a) + +/* Operations requiring write-locking the VM */ +#define VM_REINIT (VMM_LOCK_IOC_BASE | 0x01) +#define VM_BIND_PPTDEV (VMM_LOCK_IOC_BASE | 0x02) +#define VM_UNBIND_PPTDEV (VMM_LOCK_IOC_BASE | 0x03) +#define VM_MAP_PPTDEV_MMIO (VMM_LOCK_IOC_BASE | 0x04) +#define VM_ALLOC_MEMSEG (VMM_LOCK_IOC_BASE | 0x05) +#define VM_MMAP_MEMSEG (VMM_LOCK_IOC_BASE | 0x06) +#define VM_PMTMR_LOCATE (VMM_LOCK_IOC_BASE | 0x07) +#define VM_MUNMAP_MEMSEG (VMM_LOCK_IOC_BASE | 0x08) +#define VM_UNMAP_PPTDEV_MMIO (VMM_LOCK_IOC_BASE | 0x09) + +#define VM_WRLOCK_CYCLE (VMM_LOCK_IOC_BASE | 0xff) + +/* All other ioctls */ +#define VM_GET_GPA_PMAP (VMM_IOC_BASE | 0x01) +#define VM_GET_MEMSEG (VMM_IOC_BASE | 0x02) +#define VM_MMAP_GETNEXT (VMM_IOC_BASE | 0x03) + +#define VM_LAPIC_IRQ (VMM_IOC_BASE | 0x04) +#define VM_LAPIC_LOCAL_IRQ (VMM_IOC_BASE | 0x05) +#define VM_LAPIC_MSI (VMM_IOC_BASE | 0x06) + +#define VM_IOAPIC_ASSERT_IRQ (VMM_IOC_BASE | 0x07) +#define VM_IOAPIC_DEASSERT_IRQ (VMM_IOC_BASE | 0x08) +#define VM_IOAPIC_PULSE_IRQ (VMM_IOC_BASE | 0x09) + +#define VM_ISA_ASSERT_IRQ (VMM_IOC_BASE | 0x0a) +#define VM_ISA_DEASSERT_IRQ (VMM_IOC_BASE | 0x0b) +#define VM_ISA_PULSE_IRQ (VMM_IOC_BASE | 0x0c) +#define VM_ISA_SET_IRQ_TRIGGER (VMM_IOC_BASE | 0x0d) + +#define VM_RTC_WRITE (VMM_IOC_BASE | 0x0e) +#define VM_RTC_READ (VMM_IOC_BASE | 0x0f) +#define VM_RTC_SETTIME (VMM_IOC_BASE | 0x10) +#define VM_RTC_GETTIME (VMM_IOC_BASE | 0x11) + +#define VM_SUSPEND (VMM_IOC_BASE | 0x12) + +#define VM_IOAPIC_PINCOUNT (VMM_IOC_BASE | 0x13) +#define VM_GET_PPTDEV_LIMITS (VMM_IOC_BASE | 0x14) +#define VM_GET_HPET_CAPABILITIES (VMM_IOC_BASE | 0x15) + +#define VM_STATS_IOC (VMM_IOC_BASE | 0x16) +#define VM_STAT_DESC (VMM_IOC_BASE | 0x17) + +#define VM_INJECT_NMI (VMM_IOC_BASE | 0x18) +#define VM_GET_X2APIC_STATE (VMM_IOC_BASE | 0x19) +#define VM_SET_TOPOLOGY (VMM_IOC_BASE | 0x1a) +#define VM_GET_TOPOLOGY (VMM_IOC_BASE | 0x1b) +#define VM_GET_CPUS (VMM_IOC_BASE | 0x1c) +#define VM_SUSPEND_CPU (VMM_IOC_BASE | 0x1d) +#define VM_RESUME_CPU (VMM_IOC_BASE | 0x1e) + +#define VM_PPTDEV_DISABLE_MSIX (VMM_IOC_BASE | 0x1f) + +/* Note: forces a barrier on a flush operation before returning. */ +#define VM_TRACK_DIRTY_PAGES (VMM_IOC_BASE | 0x20) +#define VM_DESC_FPU_AREA (VMM_IOC_BASE | 0x21) + +#define VM_DEVMEM_GETOFFSET (VMM_IOC_BASE | 0xff) + +#define VMM_CTL_DEV "/dev/vmmctl" + +#endif diff --git a/usr/src/uts/intel/sys/vmm_drv.h b/usr/src/uts/intel/sys/vmm_drv.h new file mode 100644 index 0000000000..0b7f622e53 --- /dev/null +++ b/usr/src/uts/intel/sys/vmm_drv.h @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ + +/* + * Copyright 2019 Joyent, Inc. 
+ * Copyright 2021 Oxide Computer Company + */ + +#ifndef _VMM_DRV_H_ +#define _VMM_DRV_H_ + +#ifdef _KERNEL + +#include <sys/file.h> +#include <sys/stdbool.h> + +struct vmm_hold; +typedef struct vmm_hold vmm_hold_t; + +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + +/* + * This is effectively a synonym for the bhyve-internal 'struct vm_page' type. + * Use of `vmm_page_t *` instead allows us to keep those implementation details + * hidden from vmm_drv consumers. + */ +struct vmm_page; +typedef struct vmm_page vmm_page_t; + +/* + * Because of tangled headers, this definition mirrors its ioport_handler_t + * counterpart in vmm_kernel.h. + */ +typedef int (*vmm_drv_iop_cb_t)(void *, bool, uint16_t, uint8_t, uint32_t *); + +extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); +extern void vmm_drv_rele(vmm_hold_t *); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern vmm_page_t *vmm_drv_page_hold(vmm_lease_t *, uintptr_t, int); +extern void vmm_drv_page_release(vmm_page_t *); +extern void vmm_drv_page_release_chain(vmm_page_t *); +extern const void *vmm_drv_page_readable(const vmm_page_t *); +extern void *vmm_drv_page_writable(const vmm_page_t *); +extern void vmm_drv_page_chain(vmm_page_t *, vmm_page_t *); +extern vmm_page_t *vmm_drv_page_next(const vmm_page_t *); + +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + +extern int vmm_drv_ioport_hook(vmm_hold_t *, uint16_t, vmm_drv_iop_cb_t, void *, + void **); +extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); +#endif /* _KERNEL */ + +#endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/intel/viona/Makefile b/usr/src/uts/intel/viona/Makefile new file mode 100644 index 0000000000..a09dbbe9e9 --- /dev/null +++ b/usr/src/uts/intel/viona/Makefile @@ -0,0 +1,54 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. +# Copyright 2022 Oxide Computer Company +# + +UTSBASE = ../.. + +MODULE = viona +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/viona +MAPFILE = $(UTSBASE)/intel/io/viona/viona.mapfile + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# needs work +SMOFF += all_func_returns + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) + +CFLAGS += $(CCVERBOSE) +LDFLAGS += -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti +LDFLAGS += -Nmisc/hook +LDFLAGS += -M $(MAPFILE) + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ
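
As a rough illustration of how the userspace-facing interface introduced by this change fits together, the sketch below creates a VM through the vmmctl control device and runs a minimal vCPU exit-handling loop, using only the ioctls and structures declared in the new sys/vmm.h and sys/vmm_dev.h headers above. This is not part of the patch: the /dev/vmm/<name> device path, the include paths, the omission of memory-segment and vCPU setup, and all error handling are assumptions made purely for illustration.

/*
 * Hypothetical consumer of the vmm ioctl interface (not part of this commit).
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/vmm.h>		/* assumed install path for these headers */
#include <sys/vmm_dev.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct vm_create_req req = { 0 };
	struct vm_entry entry = { 0 };
	struct vm_exit vexit = { 0 };
	char vmpath[MAXPATHLEN];
	int ctlfd, vmfd;

	/* Create a new VM instance via the vmmctl control device. */
	if ((ctlfd = open(VMM_CTL_DEV, O_RDWR)) < 0) {
		perror("open " VMM_CTL_DEV);
		return (1);
	}
	(void) strlcpy(req.name, "example-vm", sizeof (req.name));
	req.flags = 0;	/* or VCF_RESERVOIR_MEM to use reservoir memory */
	if (ioctl(ctlfd, VMM_CREATE_VM, &req) != 0) {
		perror("VMM_CREATE_VM");
		return (1);
	}

	/*
	 * Per-VM and per-vCPU ioctls are issued against the VM's own device
	 * node; the /dev/vmm/<name> path is an assumption for this sketch.
	 * A real consumer would also allocate and map memory segments
	 * (VM_ALLOC_MEMSEG, VM_MMAP_MEMSEG) and set vCPU state before running.
	 */
	(void) snprintf(vmpath, sizeof (vmpath), "/dev/vmm/%s", req.name);
	if ((vmfd = open(vmpath, O_RDWR)) < 0) {
		perror(vmpath);
		return (1);
	}

	/*
	 * Minimal run loop for vCPU 0: VM_RUN enters the guest and, on exit,
	 * fills the vm_exit structure pointed to by entry.exit_data.
	 */
	entry.cpuid = 0;
	entry.cmd = VEC_DEFAULT;
	entry.exit_data = &vexit;

	while (ioctl(vmfd, VM_RUN, &entry) == 0) {
		switch (vexit.exitcode) {
		case VM_EXITCODE_INOUT:
			/*
			 * Emulate the port access, then complete it on the
			 * next entry (for an 'in', eax would carry the value
			 * produced by the emulation).
			 */
			entry.u.inout = vexit.u.inout;
			entry.cmd = VEC_FULFILL_INOUT;
			break;
		case VM_EXITCODE_MMIO:
			entry.u.mmio = vexit.u.mmio;
			entry.cmd = VEC_FULFILL_MMIO;
			break;
		case VM_EXITCODE_SUSPENDED:
			(void) close(vmfd);
			(void) close(ctlfd);
			return (0);
		default:
			entry.cmd = VEC_DEFAULT;
			break;
		}
	}

	perror("VM_RUN");
	return (1);
}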