-rw-r--r--  usr/src/cmd/bhyvectl/bhyvectl.c             |  18
-rw-r--r--  usr/src/lib/libvmmapi/common/mapfile-vers   |   3
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.c       |  13
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.h       |   6
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona.c          | 301
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.c              |  19
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.mapfile        |   5
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c      | 419
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c       |  10
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_dev.h             |   4
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_drv.h             |  18
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_impl.h            |  17

12 files changed, 622 insertions(+), 211 deletions(-)
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index d7179d5874..bbe36917fd 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -40,7 +40,7 @@ /* * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/cdefs.h> @@ -93,6 +93,9 @@ usage(bool cpu_intel) " [--cpu=<vcpu_number>]\n" " [--create]\n" " [--destroy]\n" +#ifndef __FreeBSD__ + " [--wrlock-cycle]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -306,6 +309,9 @@ static int unassign_pptdev, bus, slot, func; #endif static int run; static int get_cpu_topology; +#ifndef __FreeBSD__ +static int wrlock_cycle; +#endif /* * VMCB specific. @@ -1479,6 +1485,9 @@ setup_options(bool cpu_intel) { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifndef __FreeBSD__ + { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, +#endif }; const struct option intel_opts[] = { @@ -1903,6 +1912,13 @@ main(int argc, char *argv[]) } } +#ifndef __FreeBSD__ + if (!error && wrlock_cycle) { + error = vm_wrlock_cycle(ctx); + exit(error); + } +#endif /* __FreeBSD__ */ + if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index cecf22dd4c..f8fe636386 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -11,7 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. -# Copyright 2018 Joyent, Inc. +# Copyright 2019 Joyent, Inc. # # @@ -119,6 +119,7 @@ SYMBOL_VERSION ILLUMOSprivate { vm_suspended_cpus; vm_resume_cpu; vm_unassign_pptdev; + vm_wrlock_cycle; local: *; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index 9ef7c2eb20..bae214aba0 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -38,7 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/cdefs.h> @@ -1786,6 +1786,17 @@ vm_get_device_fd(struct vmctx *ctx) return (ctx->fd); } +#ifndef __FreeBSD__ +int +vm_wrlock_cycle(struct vmctx *ctx) +{ + if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) { + return (errno); + } + return (0); +} +#endif /* __FreeBSD__ */ + #ifdef __FreeBSD__ const cap_ioctl_t * vm_get_ioctls(size_t *len) diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index 0c372c70d0..6cb7a1186d 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -38,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _VMMAPI_H_ @@ -271,6 +272,11 @@ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); +#ifndef __FreeBSD__ +/* illumos-specific APIs */ +int vm_wrlock_cycle(struct vmctx *ctx); +#endif /* __FreeBSD__ */ + #ifdef __FreeBSD__ /* * FreeBSD specific APIs diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index bbcb970b22..80b5b07aaa 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -390,6 +390,7 @@ enum viona_ring_state { enum viona_ring_state_flags { VRSF_REQ_START = 0x1, /* start running from INIT state */ VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ }; #define VRING_NEED_BAIL(ring, proc) \ @@ -410,6 +411,7 @@ typedef struct viona_vring { uint16_t vr_state_flags; uint_t vr_xfer_outstanding; kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; /* ring-sized resources for TX activity */ viona_desb_t *vr_txdesb; @@ -422,6 +424,7 @@ typedef struct viona_vring { /* Internal ring-related state */ kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; uint16_t vr_size; uint16_t vr_mask; /* cached from vr_size */ uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ @@ -579,12 +582,14 @@ static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); static int viona_ioc_delete(viona_soft_state_t *, boolean_t); -static void *viona_gpa2kva(viona_link_t *link, uint64_t gpa, size_t len); +static void *viona_gpa2kva(viona_vring_t *, uint64_t, size_t); static void viona_ring_alloc(viona_link_t *, viona_vring_t *); static void viona_ring_free(viona_vring_t *); static int viona_ring_reset(viona_vring_t *, boolean_t); static kthread_t *viona_create_worker(viona_vring_t *); +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); static int viona_ioc_ring_init(viona_link_t *, void *, int); @@ -600,6 +605,7 @@ static void viona_desb_release(viona_desb_t *); static void viona_rx_classified(void *, mac_resource_handle_t, mblk_t *, boolean_t); static void viona_rx_mcast(void *, mac_resource_handle_t, mblk_t *, boolean_t); +static void viona_tx_wait_outstanding(viona_vring_t *); static void viona_tx(viona_link_t *, viona_vring_t *); static viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); @@ -917,7 +923,7 @@ viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) mutex_enter(&ss->ss_lock); if ((link = ss->ss_link) == NULL || link->l_destroyed || - vmm_drv_expired(link->l_vm_hold)) { + vmm_drv_release_reqd(link->l_vm_hold)) { mutex_exit(&ss->ss_lock); return (ENXIO); } @@ -1250,9 +1256,75 @@ viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) * Translate a guest physical address into a kernel virtual address. 
*/ static void * -viona_gpa2kva(viona_link_t *link, uint64_t gpa, size_t len) +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) { - return (vmm_drv_gpa2kva(link->l_vm_hold, gpa, len)); + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +static boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. + */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); } static void @@ -1322,19 +1394,78 @@ viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) } } } + viona_ring_lease_drop(ring); mutex_exit(&ring->vr_lock); return (0); } +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + static int viona_ioc_ring_init(viona_link_t *link, void *udata, int md) { 
vioc_ring_init_t kri; viona_vring_t *ring; kthread_t *t; - uintptr_t pos; - size_t desc_sz, avail_sz, used_sz; - uint16_t cnt; int err = 0; if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { @@ -1344,8 +1475,8 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md) if (kri.ri_index >= VIONA_VQ_MAX) { return (EINVAL); } - cnt = kri.ri_qsize; - if (cnt == 0 || cnt > VRING_MAX_LEN || (1 << (ffs(cnt) - 1)) != cnt) { + const uint16_t qsz = kri.ri_qsize; + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { return (EINVAL); } @@ -1357,39 +1488,19 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md) } VERIFY(ring->vr_state_flags == 0); - pos = kri.ri_qaddr; - desc_sz = cnt * sizeof (struct virtio_desc); - avail_sz = (cnt + 3) * sizeof (uint16_t); - used_sz = (cnt * sizeof (struct virtio_used)) + (sizeof (uint16_t) * 3); - - ring->vr_size = kri.ri_qsize; - ring->vr_mask = (ring->vr_size - 1); - ring->vr_descr = viona_gpa2kva(link, pos, desc_sz); - if (ring->vr_descr == NULL) { - err = EINVAL; + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; goto fail; } - pos += desc_sz; - ring->vr_avail_flags = viona_gpa2kva(link, pos, avail_sz); - if (ring->vr_avail_flags == NULL) { + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = kri.ri_qaddr; + if (!viona_ring_map(ring)) { err = EINVAL; goto fail; } - ring->vr_avail_idx = ring->vr_avail_flags + 1; - ring->vr_avail_ring = ring->vr_avail_flags + 2; - ring->vr_avail_used_event = ring->vr_avail_ring + cnt; - pos += avail_sz; - - pos = P2ROUNDUP(pos, VRING_ALIGN); - ring->vr_used_flags = viona_gpa2kva(link, pos, used_sz); - if (ring->vr_used_flags == NULL) { - err = EINVAL; - goto fail; - } - ring->vr_used_idx = ring->vr_used_flags + 1; - ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); - ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + cnt); /* Initialize queue indexes */ ring->vr_cur_aidx = 0; @@ -1398,9 +1509,9 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md) if (kri.ri_index == VIONA_VQ_TX && !link->l_force_tx_copy) { viona_desb_t *dp; - dp = kmem_zalloc(sizeof (viona_desb_t) * cnt, KM_SLEEP); + dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); ring->vr_txdesb = dp; - for (uint_t i = 0; i < cnt; i++, dp++) { + for (uint_t i = 0; i < qsz; i++, dp++) { dp->d_frtn.free_func = viona_desb_release; dp->d_frtn.free_arg = (void *)dp; dp->d_ring = ring; @@ -1411,7 +1522,7 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md) /* Allocate ring-sized iovec buffers for TX */ if (kri.ri_index == VIONA_VQ_TX) { - ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * cnt, + ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); } @@ -1434,18 +1545,10 @@ viona_ioc_ring_init(viona_link_t *link, void *udata, int md) return (0); fail: + viona_ring_lease_drop(ring); viona_ring_misc_free(ring); ring->vr_size = 0; ring->vr_mask = 0; - ring->vr_descr = NULL; - ring->vr_avail_flags = NULL; - ring->vr_avail_idx = NULL; - ring->vr_avail_ring = NULL; - ring->vr_avail_used_event = NULL; - ring->vr_used_flags = NULL; - ring->vr_used_idx = NULL; - ring->vr_used_ring = NULL; - ring->vr_used_avail_event = NULL; mutex_exit(&ring->vr_lock); return (err); } @@ -1591,6 +1694,25 @@ viona_worker_rx(viona_vring_t *ring, viona_link_t *link) *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX 
barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + /* * For now, there is little to do in the RX worker as inbound * data is delivered by MAC via the RX callbacks. If tap-like @@ -1617,6 +1739,7 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link) for (;;) { boolean_t bail = B_FALSE; + boolean_t renew = B_FALSE; uint_t ntx = 0; *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; @@ -1644,7 +1767,8 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link) */ membar_enter(); bail = VRING_NEED_BAIL(ring, p); - if (!bail && viona_vr_num_avail(ring)) { + renew = vmm_drv_lease_expired(ring->vr_lease); + if (!bail && !renew && viona_vr_num_avail(ring)) { continue; } @@ -1653,26 +1777,35 @@ viona_worker_tx(viona_vring_t *ring, viona_link_t *link) } mutex_enter(&ring->vr_lock); - while (!bail && !viona_vr_num_avail(ring)) { + + while (!bail && !renew && !viona_vr_num_avail(ring)) { (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); } + if (bail) { break; + } else if (renew) { + ring->vr_state_flags |= VRSF_RENEW; + /* + * When renewing the lease for the ring, no TX + * frames may be outstanding, as they contain + * references to guest memory. + */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; } mutex_exit(&ring->vr_lock); } ASSERT(MUTEX_HELD(&ring->vr_lock)); - while (ring->vr_xfer_outstanding != 0) { - /* - * Paying heed to signals is counterproductive here. This is a - * very tight loop if pending transfers take an extended amount - * of time to be reclaimed while the host process is exiting. - */ - cv_wait(&ring->vr_cv, &ring->vr_lock); - } + viona_tx_wait_outstanding(ring); } static void @@ -1695,6 +1828,16 @@ viona_worker(void *arg) cv_broadcast(&ring->vr_cv); while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. 
+ */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); if (VRING_NEED_BAIL(ring, p)) { @@ -1706,6 +1849,13 @@ viona_worker(void *arg) ring->vr_state = VRS_RUN; ring->vr_state_flags &= ~VRSF_REQ_START; + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + /* Process actual work */ if (ring == &link->l_vrings[VIONA_VQ_RX]) { viona_worker_rx(ring, link); @@ -1725,6 +1875,7 @@ cleanup: } viona_ring_misc_free(ring); + viona_ring_lease_drop(ring); ring->vr_cur_aidx = 0; ring->vr_state = VRS_RESET; ring->vr_state_flags = 0; @@ -1799,7 +1950,6 @@ viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) static int vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie) { - viona_link_t *link = ring->vr_link; uint_t i, ndesc, idx, head, next; struct virtio_desc vdir; void *buf; @@ -1848,7 +1998,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie) VIONA_RING_STAT_INCR(ring, desc_bad_len); goto bail; } - buf = viona_gpa2kva(link, vdir.vd_addr, vdir.vd_len); + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); if (buf == NULL) { VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); VIONA_RING_STAT_INCR(ring, bad_ring_addr); @@ -1868,7 +2018,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie) VIONA_RING_STAT_INCR(ring, indir_bad_len); goto bail; } - vindir = viona_gpa2kva(link, vdir.vd_addr, vdir.vd_len); + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); if (vindir == NULL) { VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); VIONA_RING_STAT_INCR(ring, bad_ring_addr); @@ -1901,7 +2051,7 @@ vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie) desc_bad_len); goto bail; } - buf = viona_gpa2kva(link, vp.vd_addr, + buf = viona_gpa2kva(ring, vp.vd_addr, vp.vd_len); if (buf == NULL) { VIONA_PROBE_BAD_RING_ADDR(ring, @@ -2004,7 +2154,7 @@ viona_intr_ring(viona_vring_t *ring) uint64_t msg = ring->vr_msi_msg; mutex_exit(&ring->vr_lock); - (void) vmm_drv_msi(ring->vr_link->l_vm_hold, addr, msg); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); return; } mutex_exit(&ring->vr_lock); @@ -2528,8 +2678,9 @@ viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, { viona_vring_t *ring = (viona_vring_t *)arg; - /* Immediately drop traffic if ring is inactive */ - if (ring->vr_state != VRS_RUN) { + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { freemsgchain(mp); return; } @@ -2546,8 +2697,9 @@ viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, mblk_t *mp_mcast_only = NULL; mblk_t **mpp = &mp_mcast_only; - /* Immediately drop traffic if ring is inactive */ - if (ring->vr_state != VRS_RUN) { + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { freemsgchain(mp); return; } @@ -2651,6 +2803,21 @@ viona_desb_release(viona_desb_t *dp) mutex_exit(&ring->vr_lock); } +static void +viona_tx_wait_outstanding(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + while (ring->vr_xfer_outstanding != 0) { + /* + * Paying heed to signals is counterproductive here. 
This is a + * very tight loop if pending transfers take an extended amount + * of time to be reclaimed while the host process is exiting. + */ + cv_wait(&ring->vr_cv, &ring->vr_lock); + } +} + static boolean_t viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, mblk_t *mp, uint32_t len) diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 991d6c7850..47a5f26cb7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -197,7 +197,6 @@ struct vm { uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ #ifndef __FreeBSD__ - krwlock_t ioport_rwlock; list_t ioport_hooks; #endif /* __FreeBSD__ */ }; @@ -526,7 +525,6 @@ vm_init(struct vm *vm, bool create) vm->vrtc = vrtc_init(vm); #ifndef __FreeBSD__ if (create) { - rw_init(&vm->ioport_rwlock, NULL, RW_DEFAULT, NULL); list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t), offsetof (vm_ioport_hook_t, vmih_node)); } else { @@ -3135,7 +3133,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, return (EINVAL); } - rw_enter(&vm->ioport_rwlock, RW_WRITER); /* * Find the node position in the list which this region should be * inserted behind to maintain sorted order. @@ -3143,7 +3140,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) { if (ioport == node->vmih_ioport) { /* Reject duplicate port hook */ - rw_exit(&vm->ioport_rwlock); return (EEXIST); } else if (ioport > node->vmih_ioport) { break; @@ -3162,7 +3158,6 @@ vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, } *cookie = (void *)hook; - rw_exit(&vm->ioport_rwlock); return (0); } @@ -3172,12 +3167,10 @@ vm_ioport_unhook(struct vm *vm, void **cookie) vm_ioport_hook_t *hook; list_t *ih = &vm->ioport_hooks; - rw_enter(&vm->ioport_rwlock, RW_WRITER); hook = *cookie; list_remove(ih, hook); kmem_free(hook, sizeof (*hook)); *cookie = NULL; - rw_exit(&vm->ioport_rwlock); } int @@ -3188,38 +3181,32 @@ vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes, list_t *ih = &vm->ioport_hooks; int err = 0; - rw_enter(&vm->ioport_rwlock, RW_READER); for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) { if (hook->vmih_ioport == port) { break; } } if (hook == NULL) { - err = ENOENT; - goto bail; + return (ENOENT); } if (in) { uint64_t tval; if (hook->vmih_rmem_cb == NULL) { - err = ENOENT; - goto bail; + return (ENOENT); } err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port, (uint_t)bytes, &tval); *val = (uint32_t)tval; } else { if (hook->vmih_wmem_cb == NULL) { - err = ENOENT; - goto bail; + return (ENOENT); } err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port, (uint_t)bytes, (uint64_t)*val); } -bail: - rw_exit(&vm->ioport_rwlock); return (err); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile index 2059dfcc97..83c14de895 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.mapfile +++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile @@ -39,7 +39,10 @@ SYMBOL_VERSION ILLUMOSprivate { # bhyve driver API vmm_drv_hold; vmm_drv_rele; - vmm_drv_expired; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; vmm_drv_gpa2kva; vmm_drv_ioport_hook; vmm_drv_ioport_unhook; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index d20732ee1e..a5e60d4887 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ 
b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -85,11 +85,22 @@ extern int vmx_x86_supported(const char **); struct vmm_hold { list_node_t vmh_node; vmm_softc_t *vmh_sc; - boolean_t vmh_expired; + boolean_t vmh_release_req; uint_t vmh_ioport_hook_cnt; }; +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + boolean_t vml_expired; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + list_node_t vml_expire_node; + struct vmm_hold *vml_hold; +}; + static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *); static int vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) @@ -224,63 +235,164 @@ vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) return (error); } +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. 
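The block comment above (added to vmm_sol_dev.c) lays out the new read/write lock and lease discipline in prose. As a rough sketch of what that protocol asks of a vmm_drv consumer, modeled on the viona changes elsewhere in this diff, a renewal path could look like the following; the my_ring_t structure, its fields, and the my_* functions are illustrative placeholders rather than part of the change:

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/vmm_drv.h>

typedef struct my_ring {
	kmutex_t	r_lock;
	kcondvar_t	r_cv;
	vmm_hold_t	*r_hold;
	vmm_lease_t	*r_lease;
} my_ring_t;

static boolean_t
my_lease_expire_cb(void *arg)
{
	my_ring_t *ring = arg;

	/* Wake the worker thread so it notices vmm_drv_lease_expired() */
	cv_broadcast(&ring->r_cv);

	/* B_FALSE: the lease will be broken later, from the worker itself */
	return (B_FALSE);
}

static boolean_t
my_lease_renew(my_ring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->r_lock));

	/* Drop the stale lease (and any state derived from it) first ... */
	if (ring->r_lease != NULL) {
		vmm_drv_lease_break(ring->r_hold, ring->r_lease);
		ring->r_lease = NULL;
	}

	/* ... then try to sign a fresh one; NULL means the VM is going away */
	ring->r_lease = vmm_drv_lease_sign(ring->r_hold,
	    my_lease_expire_cb, ring);
	return (ring->r_lease != NULL);
}

Returning B_FALSE from the expiry callback mirrors viona's choice to break and re-sign the lease from its own worker thread rather than forcing the clean-up to run synchronously inside vmm_write_lock().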
+ */ -static int +static void vcpu_lock_one(vmm_softc_t *sc, int vcpu) { - int error; - - if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) - return (EINVAL); + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - error = vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true); - return (error); + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(vmm_softc_t *sc, int vcpu) { - enum vcpu_state state; - - state = vcpu_get_state(sc->vmm_vm, vcpu, NULL); - if (state != VCPU_FROZEN) { - panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vmm_vm), - vcpu, state); - } + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); } -static int -vcpu_lock_all(vmm_softc_t *sc) +static void +vmm_read_lock(vmm_softc_t *sc) +{ + rw_enter(&sc->vmm_rwlock, RW_READER); +} + +static void +vmm_read_unlock(vmm_softc_t *sc) +{ + rw_exit(&sc->vmm_rwlock); +} + +static void +vmm_write_lock(vmm_softc_t *sc) { - int error = 0, vcpu; - uint16_t maxcpus; + int maxcpus; + /* First lock all the vCPUs */ maxcpus = vm_get_maxcpus(sc->vmm_vm); - for (vcpu = 0; vcpu < maxcpus; vcpu++) { - error = vcpu_lock_one(sc, vcpu); - if (error) - break; + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); } - if (error) { - while (--vcpu >= 0) - vcpu_unlock_one(sc, vcpu); + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); + + while (lease != NULL) { + boolean_t sync_break = B_FALSE; + + if (!lease->vml_expired) { + void *arg = lease->vml_expire_arg; + lease->vml_expired = B_TRUE; + sync_break = lease->vml_expire_func(arg); + } + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } } + mutex_exit(&sc->vmm_lease_lock); - return (error); + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. + */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); } static void -vcpu_unlock_all(vmm_softc_t *sc) +vmm_write_unlock(vmm_softc_t *sc) { - int vcpu; - uint16_t maxcpus; + int maxcpus; + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); + /* + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. 
+ */ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ maxcpus = vm_get_maxcpus(sc->vmm_vm); - for (vcpu = 0; vcpu < maxcpus; vcpu++) + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { vcpu_unlock_one(sc, vcpu); + } } static int @@ -289,11 +401,14 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, { int error = 0, vcpu = -1; void *datap = (void *)arg; - boolean_t locked_one = B_FALSE, locked_all = B_FALSE; - - /* - * Some VMM ioctls can operate only on vcpus that are not running. - */ + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: @@ -324,53 +439,52 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { return (EFAULT); } - if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) { - error = EINVAL; - goto done; + if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { + return (EINVAL); } - - error = vcpu_lock_one(sc, vcpu); - if (error) - goto done; - locked_one = B_TRUE; + vcpu_lock_one(sc, vcpu); + lock_type = LOCK_VCPU; break; - case VM_MAP_PPTDEV_MMIO: + case VM_REINIT: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: + case VM_MAP_PPTDEV_MMIO: case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: - case VM_REINIT: - /* - * All vCPUs must be prevented from running when performing - * operations which act upon the entire VM. - */ - error = vcpu_lock_all(sc); - if (error) - goto done; - locked_all = B_TRUE; + case VM_WRLOCK_CYCLE: + vmm_write_lock(sc); + lock_type = LOCK_WRITE_HOLD; break; + case VM_GET_GPA_PMAP: case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: + case VM_LAPIC_IRQ: + case VM_INJECT_NMI: + case VM_IOAPIC_ASSERT_IRQ: + case VM_IOAPIC_DEASSERT_IRQ: + case VM_IOAPIC_PULSE_IRQ: + case VM_LAPIC_MSI: + case VM_LAPIC_LOCAL_IRQ: + case VM_GET_X2APIC_STATE: + case VM_RTC_READ: + case VM_RTC_WRITE: + case VM_RTC_SETTIME: + case VM_RTC_GETTIME: #ifndef __FreeBSD__ case VM_DEVMEM_GETOFFSET: #endif - /* - * Lock a vcpu to make sure that the memory map cannot be - * modified while it is being inspected. - */ - vcpu = vm_get_maxcpus(sc->vmm_vm) - 1; - error = vcpu_lock_one(sc, vcpu); - if (error) - goto done; - locked_one = B_TRUE; + vmm_read_lock(sc); + lock_type = LOCK_READ_HOLD; break; + case VM_IOAPIC_PINCOUNT: default: break; } + /* Execute the primary logic for the ioctl. */ switch (cmd) { case VM_RUN: { struct vm_run vmrun; @@ -978,27 +1092,17 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_SUSPEND_CPU: if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { error = EFAULT; - break; - } - if (vcpu < -1 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) { - error = EINVAL; - break; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); } - - error = vm_suspend_cpu(sc->vmm_vm, vcpu); break; case VM_RESUME_CPU: if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { error = EFAULT; - break; - } - if (vcpu < -1 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) { - error = EINVAL; - break; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); } - - error = vm_resume_cpu(sc->vmm_vm, vcpu); break; case VM_GET_CPUS: { @@ -1167,22 +1271,37 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, } break; } + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any other effects. 
+ */ + break; + } #endif default: error = ENOTTY; break; } - /* Release any vCPUs that were locked for the operation */ - if (locked_one) { + /* Release exclusion resources */ + switch (lock_type) { + case LOCK_NONE: + break; + case LOCK_VCPU: vcpu_unlock_one(sc, vcpu); - } else if (locked_all) { - vcpu_unlock_all(sc); + break; + case LOCK_READ_HOLD: + vmm_read_unlock(sc); + break; + case LOCK_WRITE_HOLD: + vmm_write_unlock(sc); + break; + default: + panic("unexpected lock type"); + break; } -done: - /* Make sure that no handler returns a bogus value like ERESTART */ - KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } @@ -1251,10 +1370,17 @@ vmmdev_do_vm_create(char *name, cred_t *cr) sc->vmm_minor = minor; list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), offsetof(vmm_devmem_entry_t, vde_node)); + list_create(&sc->vmm_holds, sizeof (vmm_hold_t), offsetof(vmm_hold_t, vmh_node)); cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), + offsetof(vmm_lease_t, vml_node)); + cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); + rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); + sc->vmm_zone = crgetzone(cr); zone_hold(sc->vmm_zone); vmm_zsd_add_vm(sc); @@ -1275,6 +1401,23 @@ fail: return (error); } +/* + * Bhyve 'Driver' Interface + * + * While many devices are emulated in the bhyve userspace process, there are + * others with performance constraints which require that they run mostly or + * entirely in-kernel. For those not integrated directly into bhyve, an API is + * needed so they can query/manipulate the portions of VM state needed to + * fulfill their purpose. + * + * This includes: + * - Translating guest-physical addresses to host-virtual pointers + * - Injecting MSIs + * - Hooking IO port addresses + * + * The vmm_drv interface exists to provide that functionality to its consumers. 
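The comment above introduces the vmm_drv API at a high level. Below is a minimal, illustrative sketch of the hold/lease lifecycle it implies, using only the functions exported by this change; my_probe_vm, my_expire_cb, and the example guest-physical address are hypothetical:

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/cred.h>
#include <sys/vmm_drv.h>

static boolean_t
my_expire_cb(void *arg)
{
	/* Nothing to wake in this sketch; request an asynchronous break */
	return (B_FALSE);
}

static int
my_probe_vm(file_t *fp, cred_t *cr)
{
	vmm_hold_t *hold;
	vmm_lease_t *lease;
	void *kva;
	int err;

	/* Take a hold on the VM instance backing the vmm device fd */
	if ((err = vmm_drv_hold(fp, cr, &hold)) != 0)
		return (err);

	/* A lease grants read-locked access to guest memory and MSI injection */
	lease = vmm_drv_lease_sign(hold, my_expire_cb, NULL);
	if (lease == NULL) {
		vmm_drv_rele(hold);
		return (EBUSY);
	}

	/* Translate one page at a placeholder guest-physical address */
	kva = vmm_drv_gpa2kva(lease, 0x1000, 4096);
	if (kva == NULL)
		err = EFAULT;

	vmm_drv_lease_break(hold, lease);
	vmm_drv_rele(hold);
	return (err);
}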
+ * (At this time, 'viona' is the only user) + */ int vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) { @@ -1311,7 +1454,8 @@ vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); hold->vmh_sc = sc; - hold->vmh_expired = B_FALSE; + hold->vmh_release_req = B_FALSE; + list_insert_tail(&sc->vmm_holds, hold); sc->vmm_flags |= VMM_HELD; *holdp = hold; @@ -1342,25 +1486,87 @@ vmm_drv_rele(vmm_hold_t *hold) } boolean_t -vmm_drv_expired(vmm_hold_t *hold) +vmm_drv_release_reqd(vmm_hold_t *hold) { ASSERT(hold != NULL); - return (hold->vmh_expired); + return (hold->vmh_release_req); +} + +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) +{ + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; + + ASSERT3P(expiref, !=, NULL); + + if (hold->vmh_release_req) { + return (NULL); + } + + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); + + return (lease); +} + +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); + + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + kmem_free(lease, sizeof (*lease)); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + + mutex_enter(&sc->vmm_lease_lock); + vmm_lease_break_locked(sc, lease); + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); } void * -vmm_drv_gpa2kva(vmm_hold_t *hold, uintptr_t gpa, size_t sz) +vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) { - struct vm *vm; - struct vmspace *vmspace; + ASSERT(lease != NULL); - ASSERT(hold != NULL); + return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); +} - vm = hold->vmh_sc->vmm_vm; - vmspace = vm_get_vmspace(vm); +int +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) +{ + ASSERT(lease != NULL); - return (vmspace_find_kva(vmspace, gpa, sz)); + return (lapic_intr_msi(lease->vml_vm, addr, msg)); } int @@ -1387,8 +1593,10 @@ vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc, hold->vmh_ioport_hook_cnt++; mutex_exit(&vmm_mtx); + vmm_write_lock(sc); err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc, (vmm_wmem_cb_t)wfunc, arg, cookie); + vmm_write_unlock(sc); if (err != 0) { mutex_enter(&vmm_mtx); @@ -1409,24 +1617,15 @@ vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) ASSERT(hold->vmh_ioport_hook_cnt != 0); sc = hold->vmh_sc; + vmm_write_lock(sc); vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); mutex_enter(&vmm_mtx); hold->vmh_ioport_hook_cnt--; mutex_exit(&vmm_mtx); } -int -vmm_drv_msi(vmm_hold_t *hold, uint64_t addr, uint64_t msg) -{ - struct vm *vm; - - ASSERT(hold != NULL); - - vm = hold->vmh_sc->vmm_vm; - return (lapic_intr_msi(vm, addr, msg)); -} - static int vmm_drv_purge(vmm_softc_t *sc) { @@ -1438,7 +1637,7 @@ vmm_drv_purge(vmm_softc_t *sc) sc->vmm_flags |= VMM_CLEANUP; for (hold = 
list_head(&sc->vmm_holds); hold != NULL; hold = list_next(&sc->vmm_holds, hold)) { - hold->vmh_expired = B_TRUE; + hold->vmh_release_req = B_TRUE; } while ((sc->vmm_flags & VMM_HELD) != 0) { if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { @@ -1730,10 +1929,8 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, if (sc->vmm_flags & VMM_DESTROY) return (ENXIO); - /* Get a read lock on the guest memory map by freezing any vcpu. */ - if ((err = vcpu_lock_all(sc)) != 0) { - return (err); - } + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); vm = sc->vmm_vm; vms = vm_get_vmspace(vm); @@ -1758,7 +1955,7 @@ vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, out: - vcpu_unlock_all(sc); + vmm_read_unlock(sc); return (err); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c index 58a62586a1..66a67d9529 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -148,8 +148,13 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) vmspace_mapping_t *vmsm; void *result = NULL; - mutex_enter(&vms->vms_lock); - vmsm = vm_mapping_find(vms, addr, size, B_FALSE); + /* + * Since vmspace_find_kva is provided so that vmm_drv consumers can do + * GPA2KVA translations, it is expected to be called when there is a + * read lock preventing vmspace alterations. As such, it can do the + * lockless vm_mapping_find() lookup. + */ + vmsm = vm_mapping_find(vms, addr, size, B_TRUE); if (vmsm != NULL) { struct vm_object *vmo = vmsm->vmsm_object; @@ -162,7 +167,6 @@ vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) break; } } - mutex_exit(&vms->vms_lock); return (result); } diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 63ccc36dc6..dd87dcb0a6 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -38,7 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_DEV_H_ @@ -387,6 +387,7 @@ enum { #ifndef __FreeBSD__ /* illumos-custom ioctls */ IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, #endif }; @@ -504,6 +505,7 @@ enum { #ifndef __FreeBSD__ #define VM_DEVMEM_GETOFFSET \ _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) /* ioctls used against ctl device for vm create/destroy */ #define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h index b883070abf..33fefc10ea 100644 --- a/usr/src/uts/i86pc/sys/vmm_drv.h +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_DRV_H_ @@ -20,6 +20,9 @@ struct vmm_hold; typedef struct vmm_hold vmm_hold_t; +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + /* * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t * counterparts in vmm.h. 
@@ -29,12 +32,19 @@ typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); extern void vmm_drv_rele(vmm_hold_t *); -extern boolean_t vmm_drv_expired(vmm_hold_t *); -extern void *vmm_drv_gpa2kva(vmm_hold_t *, uintptr_t, size_t); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t); +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t, vmm_drv_wmem_cb_t, void *, void **); extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); -extern int vmm_drv_msi(vmm_hold_t *, uint64_t, uint64_t); #endif /* _KERNEL */ #endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h index 8fa19c8247..cdc56cc464 100644 --- a/usr/src/uts/i86pc/sys/vmm_impl.h +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -11,7 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_IMPL_H_ @@ -46,7 +46,7 @@ typedef struct vmm_devmem_entry vmm_devmem_entry_t; typedef struct vmm_zsd vmm_zsd_t; enum vmm_softc_state { - VMM_HELD = 1, /* external driver(s) possess hold on VM */ + VMM_HELD = 1, /* external driver(s) possess hold on the VM */ VMM_CLEANUP = 2, /* request that holds are released */ VMM_PURGED = 4, /* all hold have been released */ VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */ @@ -58,11 +58,18 @@ struct vmm_softc { struct vm *vmm_vm; minor_t vmm_minor; char vmm_name[VM_MAX_NAMELEN]; - uint_t vmm_flags; - boolean_t vmm_is_open; list_t vmm_devmem_list; - list_t vmm_holds; + kcondvar_t vmm_cv; + list_t vmm_holds; + uint_t vmm_flags; + boolean_t vmm_is_open; + + kmutex_t vmm_lease_lock; + list_t vmm_lease_list; + uint_t vmm_lease_blocker; + kcondvar_t vmm_lease_cv; + krwlock_t vmm_rwlock; /* For zone specific data */ list_node_t vmm_zsd_linkage; |
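Finally, the new VM_WRLOCK_CYCLE ioctl (surfaced through libvmmapi as vm_wrlock_cycle() and through the new bhyvectl --wrlock-cycle flag) exists purely to exercise the write-lock path. A minimal userspace sketch of driving it directly, assuming the usual <vmmapi.h> include path and an existing VM instance named on the command line:

#include <stdio.h>
#include <vmmapi.h>

int
main(int argc, char *argv[])
{
	struct vmctx *ctx;
	int err;

	if (argc < 2) {
		(void) fprintf(stderr, "usage: %s <vmname>\n", argv[0]);
		return (1);
	}

	ctx = vm_open(argv[1]);
	if (ctx == NULL) {
		perror("vm_open");
		return (1);
	}

	/* Acquire and immediately release the VM write lock */
	err = vm_wrlock_cycle(ctx);
	if (err != 0)
		(void) fprintf(stderr, "wrlock cycle failed: %d\n", err);

	return (err != 0);
}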
