 usr/src/cmd/bhyvectl/bhyvectl.c           |  18
 usr/src/lib/libvmmapi/common/mapfile-vers |   3
 usr/src/lib/libvmmapi/common/vmmapi.c     |  13
 usr/src/lib/libvmmapi/common/vmmapi.h     |   6
 usr/src/uts/i86pc/io/viona/viona.c        | 301
 usr/src/uts/i86pc/io/vmm/vmm.c            |  19
 usr/src/uts/i86pc/io/vmm/vmm.mapfile      |   5
 usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c    | 419
 usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c     |  10
 usr/src/uts/i86pc/sys/vmm_dev.h           |   4
 usr/src/uts/i86pc/sys/vmm_drv.h           |  18
 usr/src/uts/i86pc/sys/vmm_impl.h          |  17
 12 files changed, 622 insertions(+), 211 deletions(-)
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c
index d7179d5874..bbe36917fd 100644
--- a/usr/src/cmd/bhyvectl/bhyvectl.c
+++ b/usr/src/cmd/bhyvectl/bhyvectl.c
@@ -40,7 +40,7 @@
/*
* Copyright 2015 Pluribus Networks Inc.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/cdefs.h>
@@ -93,6 +93,9 @@ usage(bool cpu_intel)
" [--cpu=<vcpu_number>]\n"
" [--create]\n"
" [--destroy]\n"
+#ifndef __FreeBSD__
+ " [--wrlock-cycle]\n"
+#endif
" [--get-all]\n"
" [--get-stats]\n"
" [--set-desc-ds]\n"
@@ -306,6 +309,9 @@ static int unassign_pptdev, bus, slot, func;
#endif
static int run;
static int get_cpu_topology;
+#ifndef __FreeBSD__
+static int wrlock_cycle;
+#endif
/*
* VMCB specific.
@@ -1479,6 +1485,9 @@ setup_options(bool cpu_intel)
{ "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 },
{ "get-intinfo", NO_ARG, &get_intinfo, 1 },
{ "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 },
+#ifndef __FreeBSD__
+ { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 },
+#endif
};
const struct option intel_opts[] = {
@@ -1903,6 +1912,13 @@ main(int argc, char *argv[])
}
}
+#ifndef __FreeBSD__
+ if (!error && wrlock_cycle) {
+ error = vm_wrlock_cycle(ctx);
+ exit(error);
+ }
+#endif /* __FreeBSD__ */
+
if (!error && memsize)
error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers
index cecf22dd4c..f8fe636386 100644
--- a/usr/src/lib/libvmmapi/common/mapfile-vers
+++ b/usr/src/lib/libvmmapi/common/mapfile-vers
@@ -11,7 +11,7 @@
#
# Copyright 2013 Pluribus Networks Inc.
-# Copyright 2018 Joyent, Inc.
+# Copyright 2019 Joyent, Inc.
#
#
@@ -119,6 +119,7 @@ SYMBOL_VERSION ILLUMOSprivate {
vm_suspended_cpus;
vm_resume_cpu;
vm_unassign_pptdev;
+ vm_wrlock_cycle;
local:
*;
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
index 9ef7c2eb20..bae214aba0 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.c
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -38,7 +38,7 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2015 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/cdefs.h>
@@ -1786,6 +1786,17 @@ vm_get_device_fd(struct vmctx *ctx)
return (ctx->fd);
}
+#ifndef __FreeBSD__
+int
+vm_wrlock_cycle(struct vmctx *ctx)
+{
+ if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) {
+ return (errno);
+ }
+ return (0);
+}
+#endif /* __FreeBSD__ */
+
#ifdef __FreeBSD__
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h
index 0c372c70d0..6cb7a1186d 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.h
+++ b/usr/src/lib/libvmmapi/common/vmmapi.h
@@ -38,6 +38,7 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _VMMAPI_H_
@@ -271,6 +272,11 @@ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores,
uint16_t *threads, uint16_t *maxcpus);
+#ifndef __FreeBSD__
+/* illumos-specific APIs */
+int vm_wrlock_cycle(struct vmctx *ctx);
+#endif /* __FreeBSD__ */
+
#ifdef __FreeBSD__
/*
* FreeBSD specific APIs
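
(Editorial aside: as a quick illustration of the new libvmmapi entry point declared above, a userspace consumer could cycle the VM write lock as sketched here. This mirrors what the bhyvectl --wrlock-cycle option in this change does; it is a minimal sketch, not part of the commit, and the instance name "testvm" is a placeholder.)

#include <sys/types.h>
#include <stdio.h>
#include <vmmapi.h>

int
main(void)
{
	struct vmctx *ctx;
	int error;

	/* Open the (hypothetical) bhyve instance "testvm". */
	ctx = vm_open("testvm");
	if (ctx == NULL) {
		perror("vm_open");
		return (1);
	}

	/*
	 * Acquire and immediately release the VM write lock; returns 0 on
	 * success or an errno value if the VM_WRLOCK_CYCLE ioctl fails.
	 */
	error = vm_wrlock_cycle(ctx);
	if (error != 0) {
		fprintf(stderr, "vm_wrlock_cycle failed: %d\n", error);
	}
	return (error);
}
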
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
index bbcb970b22..80b5b07aaa 100644
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ b/usr/src/uts/i86pc/io/viona/viona.c
@@ -390,6 +390,7 @@ enum viona_ring_state {
enum viona_ring_state_flags {
VRSF_REQ_START = 0x1, /* start running from INIT state */
VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */
+ VRSF_RENEW = 0x4, /* ring renewing lease */
};
#define VRING_NEED_BAIL(ring, proc) \
@@ -410,6 +411,7 @@ typedef struct viona_vring {
uint16_t vr_state_flags;
uint_t vr_xfer_outstanding;
kthread_t *vr_worker_thread;
+ vmm_lease_t *vr_lease;
/* ring-sized resources for TX activity */
viona_desb_t *vr_txdesb;
@@ -422,6 +424,7 @@ typedef struct viona_vring {
/* Internal ring-related state */
kmutex_t vr_a_mutex; /* sync consumers of 'avail' */
kmutex_t vr_u_mutex; /* sync consumers of 'used' */
+ uint64_t vr_pa;
uint16_t vr_size;
uint16_t vr_mask; /* cached from vr_size */
uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */
@@ -579,12 +582,14 @@ static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
-static void *viona_gpa2kva(viona_link_t *link, uint64_t gpa, size_t len);
+static void *viona_gpa2kva(viona_vring_t *, uint64_t, size_t);
static void viona_ring_alloc(viona_link_t *, viona_vring_t *);
static void viona_ring_free(viona_vring_t *);
static int viona_ring_reset(viona_vring_t *, boolean_t);
static kthread_t *viona_create_worker(viona_vring_t *);
+static boolean_t viona_ring_map(viona_vring_t *);
+static void viona_ring_unmap(viona_vring_t *);
static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
@@ -600,6 +605,7 @@ static void viona_desb_release(viona_desb_t *);
static void viona_rx_classified(void *, mac_resource_handle_t, mblk_t *,
boolean_t);
static void viona_rx_mcast(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+static void viona_tx_wait_outstanding(viona_vring_t *);
static void viona_tx(viona_link_t *, viona_vring_t *);
static viona_neti_t *viona_neti_lookup_by_zid(zoneid_t);
@@ -917,7 +923,7 @@ viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
mutex_enter(&ss->ss_lock);
if ((link = ss->ss_link) == NULL || link->l_destroyed ||
- vmm_drv_expired(link->l_vm_hold)) {
+ vmm_drv_release_reqd(link->l_vm_hold)) {
mutex_exit(&ss->ss_lock);
return (ENXIO);
}
@@ -1250,9 +1256,75 @@ viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
* Translate a guest physical address into a kernel virtual address.
*/
static void *
-viona_gpa2kva(viona_link_t *link, uint64_t gpa, size_t len)
+viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len)
{
- return (vmm_drv_gpa2kva(link->l_vm_hold, gpa, len));
+ ASSERT3P(ring->vr_lease, !=, NULL);
+
+ return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len));
+}
+
+static boolean_t
+viona_ring_lease_expire_cb(void *arg)
+{
+ viona_vring_t *ring = arg;
+
+ cv_broadcast(&ring->vr_cv);
+
+ /* The lease will be broken asynchronously. */
+ return (B_FALSE);
+}
+
+static void
+viona_ring_lease_drop(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ if (ring->vr_lease != NULL) {
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+
+ /*
+ * Without an active lease, the ring mappings cannot be
+ * considered valid.
+ */
+ viona_ring_unmap(ring);
+
+ vmm_drv_lease_break(hold, ring->vr_lease);
+ ring->vr_lease = NULL;
+ }
+}
+
+static boolean_t
+viona_ring_lease_renew(viona_vring_t *ring)
+{
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ viona_ring_lease_drop(ring);
+
+ /*
+ * Lease renewal will fail if the VM has requested that all holds be
+ * cleaned up.
+ */
+ ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
+ ring);
+ if (ring->vr_lease != NULL) {
+ /* A ring undergoing renewal will need valid guest mappings */
+ if (ring->vr_pa != 0 && ring->vr_size != 0) {
+ /*
+ * If new mappings cannot be established, consider the
+ * lease renewal a failure.
+ */
+ if (!viona_ring_map(ring)) {
+ viona_ring_lease_drop(ring);
+ return (B_FALSE);
+ }
+ }
+ }
+ return (ring->vr_lease != NULL);
}
static void
@@ -1322,19 +1394,78 @@ viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
}
}
}
+ viona_ring_lease_drop(ring);
mutex_exit(&ring->vr_lock);
return (0);
}
+static boolean_t
+viona_ring_map(viona_vring_t *ring)
+{
+ uint64_t pos = ring->vr_pa;
+ const uint16_t qsz = ring->vr_size;
+
+ ASSERT3U(qsz, !=, 0);
+ ASSERT3U(pos, !=, 0);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ const size_t desc_sz = qsz * sizeof (struct virtio_desc);
+ ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz);
+ if (ring->vr_descr == NULL) {
+ goto fail;
+ }
+ pos += desc_sz;
+
+ const size_t avail_sz = (qsz + 3) * sizeof (uint16_t);
+ ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz);
+ if (ring->vr_avail_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_avail_idx = ring->vr_avail_flags + 1;
+ ring->vr_avail_ring = ring->vr_avail_flags + 2;
+ ring->vr_avail_used_event = ring->vr_avail_ring + qsz;
+ pos += avail_sz;
+
+ const size_t used_sz = (qsz * sizeof (struct virtio_used)) +
+ (sizeof (uint16_t) * 3);
+ pos = P2ROUNDUP(pos, VRING_ALIGN);
+ ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz);
+ if (ring->vr_used_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_used_idx = ring->vr_used_flags + 1;
+ ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2);
+ ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz);
+
+ return (B_TRUE);
+
+fail:
+ viona_ring_unmap(ring);
+ return (B_FALSE);
+}
+
+static void
+viona_ring_unmap(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ ring->vr_descr = NULL;
+ ring->vr_avail_flags = NULL;
+ ring->vr_avail_idx = NULL;
+ ring->vr_avail_ring = NULL;
+ ring->vr_avail_used_event = NULL;
+ ring->vr_used_flags = NULL;
+ ring->vr_used_idx = NULL;
+ ring->vr_used_ring = NULL;
+ ring->vr_used_avail_event = NULL;
+}
+
static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
vioc_ring_init_t kri;
viona_vring_t *ring;
kthread_t *t;
- uintptr_t pos;
- size_t desc_sz, avail_sz, used_sz;
- uint16_t cnt;
int err = 0;
if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
@@ -1344,8 +1475,8 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
if (kri.ri_index >= VIONA_VQ_MAX) {
return (EINVAL);
}
- cnt = kri.ri_qsize;
- if (cnt == 0 || cnt > VRING_MAX_LEN || (1 << (ffs(cnt) - 1)) != cnt) {
+ const uint16_t qsz = kri.ri_qsize;
+ if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
return (EINVAL);
}
@@ -1357,39 +1488,19 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
}
VERIFY(ring->vr_state_flags == 0);
- pos = kri.ri_qaddr;
- desc_sz = cnt * sizeof (struct virtio_desc);
- avail_sz = (cnt + 3) * sizeof (uint16_t);
- used_sz = (cnt * sizeof (struct virtio_used)) + (sizeof (uint16_t) * 3);
-
- ring->vr_size = kri.ri_qsize;
- ring->vr_mask = (ring->vr_size - 1);
- ring->vr_descr = viona_gpa2kva(link, pos, desc_sz);
- if (ring->vr_descr == NULL) {
- err = EINVAL;
+ ring->vr_lease = NULL;
+ if (!viona_ring_lease_renew(ring)) {
+ err = EBUSY;
goto fail;
}
- pos += desc_sz;
- ring->vr_avail_flags = viona_gpa2kva(link, pos, avail_sz);
- if (ring->vr_avail_flags == NULL) {
+ ring->vr_size = qsz;
+ ring->vr_mask = (ring->vr_size - 1);
+ ring->vr_pa = kri.ri_qaddr;
+ if (!viona_ring_map(ring)) {
err = EINVAL;
goto fail;
}
- ring->vr_avail_idx = ring->vr_avail_flags + 1;
- ring->vr_avail_ring = ring->vr_avail_flags + 2;
- ring->vr_avail_used_event = ring->vr_avail_ring + cnt;
- pos += avail_sz;
-
- pos = P2ROUNDUP(pos, VRING_ALIGN);
- ring->vr_used_flags = viona_gpa2kva(link, pos, used_sz);
- if (ring->vr_used_flags == NULL) {
- err = EINVAL;
- goto fail;
- }
- ring->vr_used_idx = ring->vr_used_flags + 1;
- ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2);
- ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + cnt);
/* Initialize queue indexes */
ring->vr_cur_aidx = 0;
@@ -1398,9 +1509,9 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
if (kri.ri_index == VIONA_VQ_TX && !link->l_force_tx_copy) {
viona_desb_t *dp;
- dp = kmem_zalloc(sizeof (viona_desb_t) * cnt, KM_SLEEP);
+ dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
ring->vr_txdesb = dp;
- for (uint_t i = 0; i < cnt; i++, dp++) {
+ for (uint_t i = 0; i < qsz; i++, dp++) {
dp->d_frtn.free_func = viona_desb_release;
dp->d_frtn.free_arg = (void *)dp;
dp->d_ring = ring;
@@ -1411,7 +1522,7 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
/* Allocate ring-sized iovec buffers for TX */
if (kri.ri_index == VIONA_VQ_TX) {
- ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * cnt,
+ ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz,
KM_SLEEP);
}
@@ -1434,18 +1545,10 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
return (0);
fail:
+ viona_ring_lease_drop(ring);
viona_ring_misc_free(ring);
ring->vr_size = 0;
ring->vr_mask = 0;
- ring->vr_descr = NULL;
- ring->vr_avail_flags = NULL;
- ring->vr_avail_idx = NULL;
- ring->vr_avail_ring = NULL;
- ring->vr_avail_used_event = NULL;
- ring->vr_used_flags = NULL;
- ring->vr_used_idx = NULL;
- ring->vr_used_ring = NULL;
- ring->vr_used_avail_event = NULL;
mutex_exit(&ring->vr_lock);
return (err);
}
@@ -1591,6 +1694,25 @@ viona_worker_rx(viona_vring_t *ring, viona_link_t *link)
*ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
do {
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ /*
+ * Set the renewal flag, causing incoming traffic to be
+ * dropped, and issue an RX barrier to ensure any
+ * threads in the RX callbacks will have finished.
+ * The vr_lock cannot be held across the barrier as it
+ * poses a deadlock risk.
+ */
+ ring->vr_state_flags |= VRSF_RENEW;
+ mutex_exit(&ring->vr_lock);
+ mac_rx_barrier(link->l_mch);
+ mutex_enter(&ring->vr_lock);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
+ }
+
/*
* For now, there is little to do in the RX worker as inbound
* data is delivered by MAC via the RX callbacks. If tap-like
@@ -1617,6 +1739,7 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
for (;;) {
boolean_t bail = B_FALSE;
+ boolean_t renew = B_FALSE;
uint_t ntx = 0;
*ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
@@ -1644,7 +1767,8 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
*/
membar_enter();
bail = VRING_NEED_BAIL(ring, p);
- if (!bail && viona_vr_num_avail(ring)) {
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ if (!bail && !renew && viona_vr_num_avail(ring)) {
continue;
}
@@ -1653,26 +1777,35 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
}
mutex_enter(&ring->vr_lock);
- while (!bail && !viona_vr_num_avail(ring)) {
+
+ while (!bail && !renew && !viona_vr_num_avail(ring)) {
(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
}
+
if (bail) {
break;
+ } else if (renew) {
+ ring->vr_state_flags |= VRSF_RENEW;
+ /*
+ * When renewing the lease for the ring, no TX
+ * frames may be outstanding, as they contain
+ * references to guest memory.
+ */
+ viona_tx_wait_outstanding(ring);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
}
mutex_exit(&ring->vr_lock);
}
ASSERT(MUTEX_HELD(&ring->vr_lock));
- while (ring->vr_xfer_outstanding != 0) {
- /*
- * Paying heed to signals is counterproductive here. This is a
- * very tight loop if pending transfers take an extended amount
- * of time to be reclaimed while the host process is exiting.
- */
- cv_wait(&ring->vr_cv, &ring->vr_lock);
- }
+ viona_tx_wait_outstanding(ring);
}
static void
@@ -1695,6 +1828,16 @@ viona_worker(void *arg)
cv_broadcast(&ring->vr_cv);
while (ring->vr_state_flags == 0) {
+ /*
+ * Keeping lease renewals timely while waiting for the ring to
+ * be started is important for avoiding deadlocks.
+ */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
if (VRING_NEED_BAIL(ring, p)) {
@@ -1706,6 +1849,13 @@ viona_worker(void *arg)
ring->vr_state = VRS_RUN;
ring->vr_state_flags &= ~VRSF_REQ_START;
+ /* Ensure ring lease is valid first */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
/* Process actual work */
if (ring == &link->l_vrings[VIONA_VQ_RX]) {
viona_worker_rx(ring, link);
@@ -1725,6 +1875,7 @@ cleanup:
}
viona_ring_misc_free(ring);
+ viona_ring_lease_drop(ring);
ring->vr_cur_aidx = 0;
ring->vr_state = VRS_RESET;
ring->vr_state_flags = 0;
@@ -1799,7 +1950,6 @@ viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
static int
vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie)
{
- viona_link_t *link = ring->vr_link;
uint_t i, ndesc, idx, head, next;
struct virtio_desc vdir;
void *buf;
@@ -1848,7 +1998,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie)
VIONA_RING_STAT_INCR(ring, desc_bad_len);
goto bail;
}
- buf = viona_gpa2kva(link, vdir.vd_addr, vdir.vd_len);
+ buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
if (buf == NULL) {
VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
VIONA_RING_STAT_INCR(ring, bad_ring_addr);
@@ -1868,7 +2018,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie)
VIONA_RING_STAT_INCR(ring, indir_bad_len);
goto bail;
}
- vindir = viona_gpa2kva(link, vdir.vd_addr, vdir.vd_len);
+ vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
if (vindir == NULL) {
VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
VIONA_RING_STAT_INCR(ring, bad_ring_addr);
@@ -1901,7 +2051,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie)
desc_bad_len);
goto bail;
}
- buf = viona_gpa2kva(link, vp.vd_addr,
+ buf = viona_gpa2kva(ring, vp.vd_addr,
vp.vd_len);
if (buf == NULL) {
VIONA_PROBE_BAD_RING_ADDR(ring,
@@ -2004,7 +2154,7 @@ viona_intr_ring(viona_vring_t *ring)
uint64_t msg = ring->vr_msi_msg;
mutex_exit(&ring->vr_lock);
- (void) vmm_drv_msi(ring->vr_link->l_vm_hold, addr, msg);
+ (void) vmm_drv_msi(ring->vr_lease, addr, msg);
return;
}
mutex_exit(&ring->vr_lock);
@@ -2528,8 +2678,9 @@ viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
{
viona_vring_t *ring = (viona_vring_t *)arg;
- /* Immediately drop traffic if ring is inactive */
- if (ring->vr_state != VRS_RUN) {
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
freemsgchain(mp);
return;
}
@@ -2546,8 +2697,9 @@ viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
mblk_t *mp_mcast_only = NULL;
mblk_t **mpp = &mp_mcast_only;
- /* Immediately drop traffic if ring is inactive */
- if (ring->vr_state != VRS_RUN) {
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
freemsgchain(mp);
return;
}
@@ -2651,6 +2803,21 @@ viona_desb_release(viona_desb_t *dp)
mutex_exit(&ring->vr_lock);
}
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ while (ring->vr_xfer_outstanding != 0) {
+ /*
+ * Paying heed to signals is counterproductive here. This is a
+ * very tight loop if pending transfers take an extended amount
+ * of time to be reclaimed while the host process is exiting.
+ */
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ }
+}
+
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
mblk_t *mp, uint32_t len)
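
(Editorial aside: taken together, the viona changes above establish the lease lifecycle any vmm_drv consumer is expected to follow: sign a lease against its hold, translate guest addresses only while the lease is valid, and drop/renew the lease when the expire callback fires. The following is a minimal sketch of that pattern under stated assumptions, not code from this change; my_ring_t, the callback, and the header paths are illustrative.)

#include <sys/types.h>
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/vmm_drv.h>

/* Illustrative per-ring state for a hypothetical vmm_drv consumer. */
typedef struct my_ring {
	kcondvar_t	r_cv;
	vmm_lease_t	*r_lease;
	void		*r_base;	/* KVA mapping of ring memory */
	uint64_t	r_gpa;		/* guest-physical ring address */
	size_t		r_len;
} my_ring_t;

/* Expire callback: wake the worker; the lease is broken asynchronously. */
static boolean_t
my_lease_expire_cb(void *arg)
{
	my_ring_t *ring = arg;

	cv_broadcast(&ring->r_cv);
	return (B_FALSE);
}

static boolean_t
my_lease_renew(vmm_hold_t *hold, my_ring_t *ring)
{
	if (ring->r_lease != NULL) {
		/* Mappings from the old lease are no longer valid. */
		ring->r_base = NULL;
		vmm_drv_lease_break(hold, ring->r_lease);
		ring->r_lease = NULL;
	}

	/* Signing fails (returns NULL) once the hold is pending release. */
	ring->r_lease = vmm_drv_lease_sign(hold, my_lease_expire_cb, ring);
	if (ring->r_lease != NULL) {
		ring->r_base = vmm_drv_gpa2kva(ring->r_lease, ring->r_gpa,
		    ring->r_len);
		if (ring->r_base == NULL) {
			vmm_drv_lease_break(hold, ring->r_lease);
			ring->r_lease = NULL;
		}
	}
	return (ring->r_lease != NULL);
}
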
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index 991d6c7850..47a5f26cb7 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -197,7 +197,6 @@ struct vm {
uint16_t threads; /* (o) num of threads/core */
uint16_t maxcpus; /* (o) max pluggable cpus */
#ifndef __FreeBSD__
- krwlock_t ioport_rwlock;
list_t ioport_hooks;
#endif /* __FreeBSD__ */
};
@@ -526,7 +525,6 @@ vm_init(struct vm *vm, bool create)
vm->vrtc = vrtc_init(vm);
#ifndef __FreeBSD__
if (create) {
- rw_init(&vm->ioport_rwlock, NULL, RW_DEFAULT, NULL);
list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t),
offsetof (vm_ioport_hook_t, vmih_node));
} else {
@@ -3135,7 +3133,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc,
return (EINVAL);
}
- rw_enter(&vm->ioport_rwlock, RW_WRITER);
/*
* Find the node position in the list which this region should be
* inserted behind to maintain sorted order.
@@ -3143,7 +3140,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc,
for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) {
if (ioport == node->vmih_ioport) {
/* Reject duplicate port hook */
- rw_exit(&vm->ioport_rwlock);
return (EEXIST);
} else if (ioport > node->vmih_ioport) {
break;
@@ -3162,7 +3158,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc,
}
*cookie = (void *)hook;
- rw_exit(&vm->ioport_rwlock);
return (0);
}
@@ -3172,12 +3167,10 @@ vm_ioport_unhook(struct vm *vm, void **cookie)
vm_ioport_hook_t *hook;
list_t *ih = &vm->ioport_hooks;
- rw_enter(&vm->ioport_rwlock, RW_WRITER);
hook = *cookie;
list_remove(ih, hook);
kmem_free(hook, sizeof (*hook));
*cookie = NULL;
- rw_exit(&vm->ioport_rwlock);
}
int
@@ -3188,38 +3181,32 @@ vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes,
list_t *ih = &vm->ioport_hooks;
int err = 0;
- rw_enter(&vm->ioport_rwlock, RW_READER);
for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) {
if (hook->vmih_ioport == port) {
break;
}
}
if (hook == NULL) {
- err = ENOENT;
- goto bail;
+ return (ENOENT);
}
if (in) {
uint64_t tval;
if (hook->vmih_rmem_cb == NULL) {
- err = ENOENT;
- goto bail;
+ return (ENOENT);
}
err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port,
(uint_t)bytes, &tval);
*val = (uint32_t)tval;
} else {
if (hook->vmih_wmem_cb == NULL) {
- err = ENOENT;
- goto bail;
+ return (ENOENT);
}
err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port,
(uint_t)bytes, (uint64_t)*val);
}
-bail:
- rw_exit(&vm->ioport_rwlock);
return (err);
}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile
index 2059dfcc97..83c14de895 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.mapfile
+++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile
@@ -39,7 +39,10 @@ SYMBOL_VERSION ILLUMOSprivate {
# bhyve driver API
vmm_drv_hold;
vmm_drv_rele;
- vmm_drv_expired;
+ vmm_drv_release_reqd;
+ vmm_drv_lease_sign;
+ vmm_drv_lease_break;
+ vmm_drv_lease_expired;
vmm_drv_gpa2kva;
vmm_drv_ioport_hook;
vmm_drv_ioport_unhook;
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index d20732ee1e..a5e60d4887 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -85,11 +85,22 @@ extern int vmx_x86_supported(const char **);
struct vmm_hold {
list_node_t vmh_node;
vmm_softc_t *vmh_sc;
- boolean_t vmh_expired;
+ boolean_t vmh_release_req;
uint_t vmh_ioport_hook_cnt;
};
+struct vmm_lease {
+ list_node_t vml_node;
+ struct vm *vml_vm;
+ boolean_t vml_expired;
+ boolean_t (*vml_expire_func)(void *);
+ void *vml_expire_arg;
+ list_node_t vml_expire_node;
+ struct vmm_hold *vml_hold;
+};
+
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
+static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
@@ -224,63 +235,164 @@ vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
return (error);
}
+/*
+ * Resource Locking and Exclusion
+ *
+ * Much of bhyve depends on key portions of VM state, such as the guest memory
+ * map, to remain unchanged while the guest is running. As ported from
+ * FreeBSD, the initial strategy for this resource exclusion hinged on gating
+ * access to the instance vCPUs. Threads acting on a single vCPU, like those
+ * performing the work of actually running the guest in VMX/SVM, would lock
+ * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
+ * state, all of the vCPUs would be first locked, ensuring that the
+ * operation(s) could complete without any other threads stumbling into
+ * intermediate states.
+ *
+ * This approach is largely effective for bhyve. Common operations, such as
+ * running the vCPUs, steer clear of lock contention. The model begins to
+ * break down for operations which do not occur in the context of a specific
+ * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
+ * thread in the bhyve process. In order to properly protect those vCPU-less
+ * operations from encountering invalid states, additional locking is required.
+ * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
+ * It does mean that class of operations will be serialized on locking the
+ * specific vCPU and that instances sized at VM_MAXCPU will potentially see
+ * undue contention on the VM_MAXCPU-1 vCPU.
+ *
+ * In order to address the shortcomings of this model, the concept of a
+ * read/write lock has been added to bhyve. Operations which change
+ * fundamental aspects of a VM (such as the memory map) must acquire the write
+ * lock, which also implies locking all of the vCPUs and waiting for all read
+ * lock holders to release. While it increases the cost and waiting time for
+ * those few operations, it allows most hot-path operations on the VM (which
+ * depend on its configuration remaining stable) to occur with minimal locking.
+ *
+ * Consumers of the Driver API (see below) are a special case when it comes to
+ * this locking, since they may hold a read lock via the drv_lease mechanism
+ * for an extended period of time. Rather than forcing those consumers to
+ * continuously poll for a write lock attempt, the lease system forces them to
+ * provide a release callback to trigger their clean-up (and potential later
+ * reacquisition) of the read lock.
+ */
-static int
+static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
- int error;
-
- if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm))
- return (EINVAL);
+ ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
- error = vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true);
- return (error);
+ /*
+ * Since this state transition is utilizing from_idle=true, it should
+ * not fail, but rather block until it can be successful.
+ */
+ VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}
static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
- enum vcpu_state state;
-
- state = vcpu_get_state(sc->vmm_vm, vcpu, NULL);
- if (state != VCPU_FROZEN) {
- panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vmm_vm),
- vcpu, state);
- }
+ ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
+ VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
}
-static int
-vcpu_lock_all(vmm_softc_t *sc)
+static void
+vmm_read_lock(vmm_softc_t *sc)
+{
+ rw_enter(&sc->vmm_rwlock, RW_READER);
+}
+
+static void
+vmm_read_unlock(vmm_softc_t *sc)
+{
+ rw_exit(&sc->vmm_rwlock);
+}
+
+static void
+vmm_write_lock(vmm_softc_t *sc)
{
- int error = 0, vcpu;
- uint16_t maxcpus;
+ int maxcpus;
+ /* First lock all the vCPUs */
maxcpus = vm_get_maxcpus(sc->vmm_vm);
- for (vcpu = 0; vcpu < maxcpus; vcpu++) {
- error = vcpu_lock_one(sc, vcpu);
- if (error)
- break;
+ for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
+ vcpu_lock_one(sc, vcpu);
}
- if (error) {
- while (--vcpu >= 0)
- vcpu_unlock_one(sc, vcpu);
+ mutex_enter(&sc->vmm_lease_lock);
+ VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
+ sc->vmm_lease_blocker++;
+ if (sc->vmm_lease_blocker == 1) {
+ list_t *list = &sc->vmm_lease_list;
+ vmm_lease_t *lease = list_head(list);
+
+ while (lease != NULL) {
+ boolean_t sync_break = B_FALSE;
+
+ if (!lease->vml_expired) {
+ void *arg = lease->vml_expire_arg;
+ lease->vml_expired = B_TRUE;
+ sync_break = lease->vml_expire_func(arg);
+ }
+
+ if (sync_break) {
+ vmm_lease_t *next;
+
+ /*
+ * These leases which are synchronously broken
+ * result in vmm_read_unlock() calls from a
+ * different thread than the corresponding
+ * vmm_read_lock(). This is acceptable, given
+ * that the rwlock underpinning the whole
+ * mechanism tolerates the behavior. This
+ * flexibility is _only_ afforded to VM read
+ * lock (RW_READER) holders.
+ */
+ next = list_next(list, lease);
+ vmm_lease_break_locked(sc, lease);
+ lease = next;
+ } else {
+ lease = list_next(list, lease);
+ }
+ }
}
+ mutex_exit(&sc->vmm_lease_lock);
- return (error);
+ rw_enter(&sc->vmm_rwlock, RW_WRITER);
+ /*
+ * For now, the 'maxcpus' value for an instance is fixed at the
+ * compile-time constant of VM_MAXCPU at creation. If this changes in
+ * the future, allowing for dynamic vCPU resource sizing, acquisition
+ * of the write lock will need to be wary of such changes.
+ */
+ VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}
static void
-vcpu_unlock_all(vmm_softc_t *sc)
+vmm_write_unlock(vmm_softc_t *sc)
{
- int vcpu;
- uint16_t maxcpus;
+ int maxcpus;
+
+ mutex_enter(&sc->vmm_lease_lock);
+ VERIFY3U(sc->vmm_lease_blocker, !=, 0);
+ sc->vmm_lease_blocker--;
+ if (sc->vmm_lease_blocker == 0) {
+ cv_broadcast(&sc->vmm_lease_cv);
+ }
+ mutex_exit(&sc->vmm_lease_lock);
+ /*
+ * The VM write lock _must_ be released from the same thread it was
+ * acquired in, unlike the read lock.
+ */
+ VERIFY(rw_write_held(&sc->vmm_rwlock));
+ rw_exit(&sc->vmm_rwlock);
+
+ /* Unlock all the vCPUs */
maxcpus = vm_get_maxcpus(sc->vmm_vm);
- for (vcpu = 0; vcpu < maxcpus; vcpu++)
+ for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
vcpu_unlock_one(sc, vcpu);
+ }
}
static int
@@ -289,11 +401,14 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
{
int error = 0, vcpu = -1;
void *datap = (void *)arg;
- boolean_t locked_one = B_FALSE, locked_all = B_FALSE;
-
- /*
- * Some VMM ioctls can operate only on vcpus that are not running.
- */
+ enum vm_lock_type {
+ LOCK_NONE = 0,
+ LOCK_VCPU,
+ LOCK_READ_HOLD,
+ LOCK_WRITE_HOLD
+ } lock_type = LOCK_NONE;
+
+ /* Acquire any exclusion resources needed for the operation. */
switch (cmd) {
case VM_RUN:
case VM_GET_REGISTER:
@@ -324,53 +439,52 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
return (EFAULT);
}
- if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
- error = EINVAL;
- goto done;
+ if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
+ return (EINVAL);
}
-
- error = vcpu_lock_one(sc, vcpu);
- if (error)
- goto done;
- locked_one = B_TRUE;
+ vcpu_lock_one(sc, vcpu);
+ lock_type = LOCK_VCPU;
break;
- case VM_MAP_PPTDEV_MMIO:
+ case VM_REINIT:
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
+ case VM_MAP_PPTDEV_MMIO:
case VM_ALLOC_MEMSEG:
case VM_MMAP_MEMSEG:
- case VM_REINIT:
- /*
- * All vCPUs must be prevented from running when performing
- * operations which act upon the entire VM.
- */
- error = vcpu_lock_all(sc);
- if (error)
- goto done;
- locked_all = B_TRUE;
+ case VM_WRLOCK_CYCLE:
+ vmm_write_lock(sc);
+ lock_type = LOCK_WRITE_HOLD;
break;
+ case VM_GET_GPA_PMAP:
case VM_GET_MEMSEG:
case VM_MMAP_GETNEXT:
+ case VM_LAPIC_IRQ:
+ case VM_INJECT_NMI:
+ case VM_IOAPIC_ASSERT_IRQ:
+ case VM_IOAPIC_DEASSERT_IRQ:
+ case VM_IOAPIC_PULSE_IRQ:
+ case VM_LAPIC_MSI:
+ case VM_LAPIC_LOCAL_IRQ:
+ case VM_GET_X2APIC_STATE:
+ case VM_RTC_READ:
+ case VM_RTC_WRITE:
+ case VM_RTC_SETTIME:
+ case VM_RTC_GETTIME:
#ifndef __FreeBSD__
case VM_DEVMEM_GETOFFSET:
#endif
- /*
- * Lock a vcpu to make sure that the memory map cannot be
- * modified while it is being inspected.
- */
- vcpu = vm_get_maxcpus(sc->vmm_vm) - 1;
- error = vcpu_lock_one(sc, vcpu);
- if (error)
- goto done;
- locked_one = B_TRUE;
+ vmm_read_lock(sc);
+ lock_type = LOCK_READ_HOLD;
break;
+ case VM_IOAPIC_PINCOUNT:
default:
break;
}
+ /* Execute the primary logic for the ioctl. */
switch (cmd) {
case VM_RUN: {
struct vm_run vmrun;
@@ -978,27 +1092,17 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
case VM_SUSPEND_CPU:
if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
error = EFAULT;
- break;
- }
- if (vcpu < -1 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
- error = EINVAL;
- break;
+ } else {
+ error = vm_suspend_cpu(sc->vmm_vm, vcpu);
}
-
- error = vm_suspend_cpu(sc->vmm_vm, vcpu);
break;
case VM_RESUME_CPU:
if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
error = EFAULT;
- break;
- }
- if (vcpu < -1 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
- error = EINVAL;
- break;
+ } else {
+ error = vm_resume_cpu(sc->vmm_vm, vcpu);
}
-
- error = vm_resume_cpu(sc->vmm_vm, vcpu);
break;
case VM_GET_CPUS: {
@@ -1167,22 +1271,37 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
}
break;
}
+ case VM_WRLOCK_CYCLE: {
+ /*
+ * Present a test mechanism to acquire/release the write lock
+ * on the VM without any other effects.
+ */
+ break;
+ }
#endif
default:
error = ENOTTY;
break;
}
- /* Release any vCPUs that were locked for the operation */
- if (locked_one) {
+ /* Release exclusion resources */
+ switch (lock_type) {
+ case LOCK_NONE:
+ break;
+ case LOCK_VCPU:
vcpu_unlock_one(sc, vcpu);
- } else if (locked_all) {
- vcpu_unlock_all(sc);
+ break;
+ case LOCK_READ_HOLD:
+ vmm_read_unlock(sc);
+ break;
+ case LOCK_WRITE_HOLD:
+ vmm_write_unlock(sc);
+ break;
+ default:
+ panic("unexpected lock type");
+ break;
}
-done:
- /* Make sure that no handler returns a bogus value like ERESTART */
- KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
@@ -1251,10 +1370,17 @@ vmmdev_do_vm_create(char *name, cred_t *cr)
sc->vmm_minor = minor;
list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
offsetof(vmm_devmem_entry_t, vde_node));
+
list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
offsetof(vmm_hold_t, vmh_node));
cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
+ offsetof(vmm_lease_t, vml_node));
+ cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
+
sc->vmm_zone = crgetzone(cr);
zone_hold(sc->vmm_zone);
vmm_zsd_add_vm(sc);
@@ -1275,6 +1401,23 @@ fail:
return (error);
}
+/*
+ * Bhyve 'Driver' Interface
+ *
+ * While many devices are emulated in the bhyve userspace process, there are
+ * others with performance constraints which require that they run mostly or
+ * entirely in-kernel. For those not integrated directly into bhyve, an API is
+ * needed so they can query/manipulate the portions of VM state needed to
+ * fulfill their purpose.
+ *
+ * This includes:
+ * - Translating guest-physical addresses to host-virtual pointers
+ * - Injecting MSIs
+ * - Hooking IO port addresses
+ *
+ * The vmm_drv interface exists to provide that functionality to its consumers.
+ * (At this time, 'viona' is the only user)
+ */
int
vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
@@ -1311,7 +1454,8 @@ vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
hold->vmh_sc = sc;
- hold->vmh_expired = B_FALSE;
+ hold->vmh_release_req = B_FALSE;
+
list_insert_tail(&sc->vmm_holds, hold);
sc->vmm_flags |= VMM_HELD;
*holdp = hold;
@@ -1342,25 +1486,87 @@ vmm_drv_rele(vmm_hold_t *hold)
}
boolean_t
-vmm_drv_expired(vmm_hold_t *hold)
+vmm_drv_release_reqd(vmm_hold_t *hold)
{
ASSERT(hold != NULL);
- return (hold->vmh_expired);
+ return (hold->vmh_release_req);
+}
+
+vmm_lease_t *
+vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
+{
+ vmm_softc_t *sc = hold->vmh_sc;
+ vmm_lease_t *lease;
+
+ ASSERT3P(expiref, !=, NULL);
+
+ if (hold->vmh_release_req) {
+ return (NULL);
+ }
+
+ lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
+ list_link_init(&lease->vml_node);
+ lease->vml_expire_func = expiref;
+ lease->vml_expire_arg = arg;
+ lease->vml_expired = B_FALSE;
+ lease->vml_hold = hold;
+ /* cache the VM pointer for one less pointer chase */
+ lease->vml_vm = sc->vmm_vm;
+
+ mutex_enter(&sc->vmm_lease_lock);
+ while (sc->vmm_lease_blocker != 0) {
+ cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
+ }
+ list_insert_tail(&sc->vmm_lease_list, lease);
+ vmm_read_lock(sc);
+ mutex_exit(&sc->vmm_lease_lock);
+
+ return (lease);
+}
+
+static void
+vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
+{
+ ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
+
+ list_remove(&sc->vmm_lease_list, lease);
+ vmm_read_unlock(sc);
+ kmem_free(lease, sizeof (*lease));
+}
+
+void
+vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
+{
+ vmm_softc_t *sc = hold->vmh_sc;
+
+ VERIFY3P(hold, ==, lease->vml_hold);
+
+ mutex_enter(&sc->vmm_lease_lock);
+ vmm_lease_break_locked(sc, lease);
+ mutex_exit(&sc->vmm_lease_lock);
+}
+
+boolean_t
+vmm_drv_lease_expired(vmm_lease_t *lease)
+{
+ return (lease->vml_expired);
}
void *
-vmm_drv_gpa2kva(vmm_hold_t *hold, uintptr_t gpa, size_t sz)
+vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
{
- struct vm *vm;
- struct vmspace *vmspace;
+ ASSERT(lease != NULL);
- ASSERT(hold != NULL);
+ return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
+}
- vm = hold->vmh_sc->vmm_vm;
- vmspace = vm_get_vmspace(vm);
+int
+vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
+{
+ ASSERT(lease != NULL);
- return (vmspace_find_kva(vmspace, gpa, sz));
+ return (lapic_intr_msi(lease->vml_vm, addr, msg));
}
int
@@ -1387,8 +1593,10 @@ vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc,
hold->vmh_ioport_hook_cnt++;
mutex_exit(&vmm_mtx);
+ vmm_write_lock(sc);
err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc,
(vmm_wmem_cb_t)wfunc, arg, cookie);
+ vmm_write_unlock(sc);
if (err != 0) {
mutex_enter(&vmm_mtx);
@@ -1409,24 +1617,15 @@ vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
ASSERT(hold->vmh_ioport_hook_cnt != 0);
sc = hold->vmh_sc;
+ vmm_write_lock(sc);
vm_ioport_unhook(sc->vmm_vm, cookie);
+ vmm_write_unlock(sc);
mutex_enter(&vmm_mtx);
hold->vmh_ioport_hook_cnt--;
mutex_exit(&vmm_mtx);
}
-int
-vmm_drv_msi(vmm_hold_t *hold, uint64_t addr, uint64_t msg)
-{
- struct vm *vm;
-
- ASSERT(hold != NULL);
-
- vm = hold->vmh_sc->vmm_vm;
- return (lapic_intr_msi(vm, addr, msg));
-}
-
static int
vmm_drv_purge(vmm_softc_t *sc)
{
@@ -1438,7 +1637,7 @@ vmm_drv_purge(vmm_softc_t *sc)
sc->vmm_flags |= VMM_CLEANUP;
for (hold = list_head(&sc->vmm_holds); hold != NULL;
hold = list_next(&sc->vmm_holds, hold)) {
- hold->vmh_expired = B_TRUE;
+ hold->vmh_release_req = B_TRUE;
}
while ((sc->vmm_flags & VMM_HELD) != 0) {
if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
@@ -1730,10 +1929,8 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
if (sc->vmm_flags & VMM_DESTROY)
return (ENXIO);
- /* Get a read lock on the guest memory map by freezing any vcpu. */
- if ((err = vcpu_lock_all(sc)) != 0) {
- return (err);
- }
+ /* Grab read lock on the VM to prevent any changes to the memory map */
+ vmm_read_lock(sc);
vm = sc->vmm_vm;
vms = vm_get_vmspace(vm);
@@ -1758,7 +1955,7 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
out:
- vcpu_unlock_all(sc);
+ vmm_read_unlock(sc);
return (err);
}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
index 58a62586a1..66a67d9529 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
@@ -148,8 +148,13 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size)
vmspace_mapping_t *vmsm;
void *result = NULL;
- mutex_enter(&vms->vms_lock);
- vmsm = vm_mapping_find(vms, addr, size, B_FALSE);
+ /*
+ * Since vmspace_find_kva is provided so that vmm_drv consumers can do
+ * GPA2KVA translations, it is expected to be called when there is a
+ * read lock preventing vmspace alterations. As such, it can do the
+ * lockless vm_mapping_find() lookup.
+ */
+ vmsm = vm_mapping_find(vms, addr, size, B_TRUE);
if (vmsm != NULL) {
struct vm_object *vmo = vmsm->vmsm_object;
@@ -162,7 +167,6 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size)
break;
}
}
- mutex_exit(&vms->vms_lock);
return (result);
}
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
index 63ccc36dc6..dd87dcb0a6 100644
--- a/usr/src/uts/i86pc/sys/vmm_dev.h
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -38,7 +38,7 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2015 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _VMM_DEV_H_
@@ -387,6 +387,7 @@ enum {
#ifndef __FreeBSD__
/* illumos-custom ioctls */
IOCNUM_DEVMEM_GETOFFSET = 256,
+ IOCNUM_WRLOCK_CYCLE = 257,
#endif
};
@@ -504,6 +505,7 @@ enum {
#ifndef __FreeBSD__
#define VM_DEVMEM_GETOFFSET \
_IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset)
+#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE)
/* ioctls used against ctl device for vm create/destroy */
#define VMM_IOC_BASE (('V' << 16) | ('M' << 8))
diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h
index b883070abf..33fefc10ea 100644
--- a/usr/src/uts/i86pc/sys/vmm_drv.h
+++ b/usr/src/uts/i86pc/sys/vmm_drv.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _VMM_DRV_H_
@@ -20,6 +20,9 @@
struct vmm_hold;
typedef struct vmm_hold vmm_hold_t;
+struct vmm_lease;
+typedef struct vmm_lease vmm_lease_t;
+
/*
* Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t
* counterparts in vmm.h.
@@ -29,12 +32,19 @@ typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t);
extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **);
extern void vmm_drv_rele(vmm_hold_t *);
-extern boolean_t vmm_drv_expired(vmm_hold_t *);
-extern void *vmm_drv_gpa2kva(vmm_hold_t *, uintptr_t, size_t);
+extern boolean_t vmm_drv_release_reqd(vmm_hold_t *);
+
+extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *),
+ void *);
+extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *);
+extern boolean_t vmm_drv_lease_expired(vmm_lease_t *);
+
+extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t);
+extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t);
+
extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t,
vmm_drv_wmem_cb_t, void *, void **);
extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **);
-extern int vmm_drv_msi(vmm_hold_t *, uint64_t, uint64_t);
#endif /* _KERNEL */
#endif /* _VMM_DRV_H_ */
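
(Editorial aside: for completeness, the remaining pieces of the driver API above, MSI injection and IO port hooks, might be exercised by a consumer roughly as follows. This is a hedged sketch only: my_dev_t, my_wmem(), the chosen port, and the MSI address window are illustrative, and error handling is elided.)

#include <sys/types.h>
#include <sys/vmm_drv.h>

typedef struct my_dev {
	vmm_hold_t	*d_hold;	/* from vmm_drv_hold() */
	vmm_lease_t	*d_lease;	/* from vmm_drv_lease_sign() */
	void		*d_cookie;	/* ioport hook cookie */
} my_dev_t;

/* Write handler for the hooked port: forward the value as a guest MSI. */
static int
my_wmem(void *arg, uintptr_t ioport, uint_t size, uint64_t val)
{
	my_dev_t *dev = arg;

	/* 0xfee00000 is the conventional x86 MSI address window. */
	return (vmm_drv_msi(dev->d_lease, 0xfee00000, val));
}

static int
my_dev_hook(my_dev_t *dev, uint_t ioport)
{
	/*
	 * IO port hooks are installed against the hold (taking the VM write
	 * lock internally), while data-path calls such as vmm_drv_msi() and
	 * vmm_drv_gpa2kva() operate on an active lease.
	 */
	return (vmm_drv_ioport_hook(dev->d_hold, ioport, NULL, my_wmem,
	    dev, &dev->d_cookie));
}
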
diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h
index 8fa19c8247..cdc56cc464 100644
--- a/usr/src/uts/i86pc/sys/vmm_impl.h
+++ b/usr/src/uts/i86pc/sys/vmm_impl.h
@@ -11,7 +11,7 @@
/*
* Copyright 2014 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _VMM_IMPL_H_
@@ -46,7 +46,7 @@ typedef struct vmm_devmem_entry vmm_devmem_entry_t;
typedef struct vmm_zsd vmm_zsd_t;
enum vmm_softc_state {
- VMM_HELD = 1, /* external driver(s) possess hold on VM */
+ VMM_HELD = 1, /* external driver(s) possess hold on the VM */
VMM_CLEANUP = 2, /* request that holds are released */
VMM_PURGED = 4, /* all hold have been released */
VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */
@@ -58,11 +58,18 @@ struct vmm_softc {
struct vm *vmm_vm;
minor_t vmm_minor;
char vmm_name[VM_MAX_NAMELEN];
- uint_t vmm_flags;
- boolean_t vmm_is_open;
list_t vmm_devmem_list;
- list_t vmm_holds;
+
kcondvar_t vmm_cv;
+ list_t vmm_holds;
+ uint_t vmm_flags;
+ boolean_t vmm_is_open;
+
+ kmutex_t vmm_lease_lock;
+ list_t vmm_lease_list;
+ uint_t vmm_lease_blocker;
+ kcondvar_t vmm_lease_cv;
+ krwlock_t vmm_rwlock;
/* For zone specific data */
list_node_t vmm_zsd_linkage;