author    Robert Mustacchi <rm@joyent.com>  2012-09-24 21:51:53 +0000
committer Robert Mustacchi <rm@joyent.com>  2012-09-24 21:51:53 +0000
commit    6d85df9c5991c26ead6195ef6eed31e604b14db5 (patch)
tree      389971ccb7da7b6e664b86feb9394f17db672901
parent    a28b557f30616c2780163ea7a52aa72477f89ad9 (diff)
download  illumos-kvm-cmd-6d85df9c5991c26ead6195ef6eed31e604b14db5.tar.gz

HVM-750 guest virtio drivers are racy with respect to interrupts (tag: 20121004)
HVM-751 CONFIGURE_ONLY check in build.sh is wrong
HVM-752 Import qemu-kvm.git barrier changes
-rwxr-xr-x  build.sh        |   4
-rw-r--r--  hw/virtio-net.c | 176
-rw-r--r--  hw/virtio.c     |  85
-rw-r--r--  hw/virtio.h     |   4
-rw-r--r--  qemu-barrier.h  |  61
-rw-r--r--  qemu_mdb.c      | 117

6 files changed, 427 insertions, 20 deletions
diff --git a/build.sh b/build.sh
index 415f13e..6618a05 100755
--- a/build.sh
+++ b/build.sh
@@ -75,9 +75,9 @@ KERNEL_SOURCE="${KERNEL_SOURCE:-$(pwd)/../../illumos}"
CTFBINDIR="$KERNEL_SOURCE"/usr/src/tools/proto/root_i386-nd/opt/onbld/bin/i386
export PATH="$PATH:$CTFBINDIR"
-if [[ -z "CONFIGURE_ONLY" ]]; then
+if [[ -z "$CONFIGURE_ONLY" ]]; then
echo "==> Make"
- gmake
+ V=1 gmake
else
echo "Not running make per-request"
fi
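
A note on the HVM-751 fix above: `[[ -z "CONFIGURE_ONLY" ]]` tests the literal string "CONFIGURE_ONLY", which is never empty, so the else branch always ran and gmake was never invoked. Adding the `$` makes the test inspect the variable's value as intended. The `V=1` passed to gmake enables qemu's verbose build output, printing full command lines.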
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index e9775a6..bb7f49a 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -26,6 +26,89 @@
#define MAC_TABLE_ENTRIES 64
#define MAX_VLAN (1 << 12) /* Per 802.1Q definition */
+/*
+ * Unfortunately some guest virtio drivers are a little racy with respect to
+ * when they notify us and when they unmask their respective interrupts.
+ * Currently we have to work around this in QEMU. While OSes normally work
+ * around pathological devices, virtual devices here will have to work around
+ * virtual hardware. To put this more concretely, a Linux guest will notify the
+ * host to do processing work before it unmasks interrupts. Therefore, by the
+ * time that we get to virtio_notify, interrupts on the available ring won't
+ * be unmasked, so we won't inject the interrupt; the guest will instead wait
+ * indefinitely for one. This leads to us losing data.
+ *
+ * We need to note whether or not we injected an interrupt during a
+ * virtio_notify. If we did not, and either of the following conditions about
+ * the ring buffers is true:
+ *
+ *   o The last available index processed equals the used index
+ *   o The last available index processed does not equal the current
+ *     available index
+ *
+ * then we set up a small timer that runs for 500 ticks, where each tick is
+ * 10ms long. If we reach 500 ticks, we just give up; that is actually a
+ * valid position, because the guest could have transmitted a small number of
+ * packets, but not enough to actually require an interrupt injection. If we
+ * do get notified, i.e. we hit virtio_net_handle_tx_timer, then we stop the
+ * timer, because we're about to do processing that may inject an interrupt.
+ * Finally, on each tick we check two conditions. The first is whether the
+ * last processed available ring index is not equal to the current available
+ * ring index; if that is true, then we effectively call virtqueue_flush as
+ * virtio_net_tx_timer would. Otherwise, we check whether the last available
+ * ring index is equal to the used ring index while interrupts are not
+ * masked; if so, we simply inject the interrupt and continue.
+ *
+ * This is summarized by the following rough state transition diagram:
+ *
+ * Otherwise +---+
+ * virtqueue_ --+ increment +---* |
+ * flush() | tick count \|/ | + avail ring
+ * finishes | +-------------+ | | index >
+ * without +---*-------------------->| |--+ | last avail
+ * injecting| | Timer | | index pro-
+ * an intr. | +-----*-------------| Active | | cessed
+ * | | | | |-----*-----------+
+ * | | | +-------------+ |
+ * | | +- 500 ticks | | |
+ * | | elapse | *--+ Avail ring |
+ * | \|/ | | unmasked |
+ * +-------------+ | | |
+ * | |<--*-----------+ | +--------+ |
+ * | Timer | | | | | |
+ * | Inactive | +- virtio_net_ +---->| Inject | |
+ * | | handle_tx_ | MSI/x | |
+ * +-------------+ timer() runs | | |
+ * ^ ^ +--------+ |
+ * | | +- always | |
+ * | | | | |
+ * | +-----------------------*------------+ |
+ * | |
+ * | +- always +------------------+ |
+ * | | | | |
+ * +---------------*---------------| Flush Virtqueues |<-----+
+ * | |
+ * +------------------+
+ */
+
+
+#define REINJECT_TICK_RATE (10000000) /* 10ms in ns */
+#define REINJECT_DEADMAN 500 /* 5s in ticks */
+
+typedef enum rein_act {
+ REIN_INJECT,
+ REIN_DEADMAN,
+ REIN_RUN
+} rein_act_t;
+
+#define REIN_RING_MAX 64
+
+typedef struct rein_event {
+ rein_act_t re_act;
+ hrtime_t re_time;
+ uint64_t re_other;
+ struct timeval re_tval;
+} rein_event_t;
+
typedef struct VirtIONet
{
VirtIODevice vdev;
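
The two arming conditions in the comment above are exactly what the new virtqueue_handled() in hw/virtio.c (further down in this patch) evaluates. As a minimal standalone restatement, with the ring index reads flattened into plain parameters and a hypothetical function name:

#include <stdint.h>

/*
 * Sketch of the virtqueue_handled() check: last_avail is the last
 * available-ring index this device processed; avail and used are the
 * guest-visible available and used ring indices.
 */
static int
rings_look_suspect(uint16_t last_avail, uint16_t avail, uint16_t used)
{
	return (last_avail == used ||	/* everything submitted was consumed */
	    last_avail != avail);	/* or the guest queued more unseen work */
}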
@@ -63,8 +146,78 @@ typedef struct VirtIONet
} mac_table;
uint32_t *vlans;
DeviceState *qdev;
+ QEMUTimer *rein_timer;
+ uint32_t rein_timer_ticks;
+ uint8_t rein_timer_act;
+ uint32_t rein_ring_idx;
+ rein_event_t rein_ring[REIN_RING_MAX];
+ uint64_t rein_n_dead;
+ uint64_t rein_n_inject;
+ uint64_t rein_n_rerun;
} VirtIONet;
+static void virtio_net_handle_tx_timer(VirtIODevice *, VirtQueue *);
+
+static void virtio_net_rein_event(VirtIONet *n, rein_act_t act, uint64_t other)
+{
+ int index = n->rein_ring_idx;
+ n->rein_ring_idx = (n->rein_ring_idx + 1) % REIN_RING_MAX;
+ rein_event_t *rep = n->rein_ring + index;
+ rep->re_time = gethrtime();
+ rep->re_act = act;
+ rep->re_other = other;
+ (void) gettimeofday(&rep->re_tval, NULL);
+}
+
+static void virtio_net_rein_disable(VirtIONet *n)
+{
+ qemu_del_timer(n->rein_timer);
+ n->rein_timer_act = 0;
+}
+
+static void virtio_net_rein_enable(VirtIONet *n)
+{
+ n->rein_timer_ticks = 0;
+ qemu_mod_timer(n->rein_timer,
+ qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
+ n->rein_timer_act = 1;
+}
+
+static void virtio_net_rein_tick(void *opaque)
+{
+ int ret;
+ VirtIONet *n = opaque;
+ assert(n->rein_timer_act);
+
+ n->rein_timer_ticks++;
+
+ /* Give up, this may be completely reasonable */
+ if (n->rein_timer_ticks > REINJECT_DEADMAN) {
+ virtio_net_rein_event(n, REIN_DEADMAN, n->rein_timer_ticks);
+ virtio_net_rein_disable(n);
+ n->rein_n_dead++;
+ return;
+ }
+
+ ret = virtqueue_stalled(n->tx_vq);
+ if (ret == 1) {
+ virtio_net_rein_event(n, REIN_INJECT, n->rein_timer_ticks);
+ virtio_net_rein_disable(n);
+ n->rein_n_inject++;
+ return;
+ } else if (ret == 2) {
+ virtio_net_rein_event(n, REIN_RUN, n->rein_timer_ticks);
+ virtio_net_rein_disable(n);
+ virtio_net_handle_tx_timer(&n->vdev, n->tx_vq);
+ n->rein_n_rerun++;
+ return;
+ }
+
+ assert(ret == 0);
+ qemu_mod_timer(n->rein_timer,
+ qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
+}
+
/* TODO
* - we could suppress RX interrupt if we were so inclined.
*/
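
virtio_net_rein_tick() above is a bounded retry loop driven by a 10ms timer: each tick re-evaluates the queue via virtqueue_stalled(), and after REINJECT_DEADMAN ticks (500 x 10ms = 5s) it stands down on the assumption that no interrupt was ever owed. A rough synchronous sketch of that control flow, with hypothetical names and check() standing in for virtqueue_stalled():

#include <stdint.h>

#define	REINJECT_DEADMAN	500	/* 5s at one check per 10ms */

/* Mirrors virtqueue_stalled()'s three possible answers. */
enum stall { NOT_STALLED = 0, STALL_INJECT = 1, STALL_RERUN = 2 };

/*
 * Poll up to REINJECT_DEADMAN times and return the first definitive
 * answer, or -1 if the deadman fires and we conclude nothing was lost.
 */
static int
poll_for_stall(enum stall (*check)(void *), void *arg)
{
	uint32_t ticks;

	for (ticks = 0; ticks < REINJECT_DEADMAN; ticks++) {
		enum stall s = check(arg);
		if (s != NOT_STALLED)
			return ((int)s);	/* caller injects (1) or reruns TX (2) */
		/* the real code sleeps REINJECT_TICK_RATE ns here via QEMUTimer */
	}
	return (-1);
}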
@@ -707,6 +860,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
{
VirtQueueElement elem;
int32_t num_packets = 0;
+ int32_t inject = 1;
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
return num_packets;
}
@@ -758,12 +912,16 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
len += ret;
virtqueue_push(vq, &elem, len);
- virtio_notify(&n->vdev, vq);
+ inject = virtio_notify(&n->vdev, vq);
if (++num_packets >= n->tx_burst) {
break;
}
}
+
+ if (inject == 0 && virtqueue_handled(vq))
+ virtio_net_rein_enable(n);
+
return num_packets;
}
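
The hunk above is where the workaround arms itself: virtio_notify() now reports whether it actually raised an interrupt, and if the last notify of a TX burst was suppressed (inject == 0) while virtqueue_handled() says the rings are in one of the two suspect states, virtio_net_rein_enable() starts the 10ms reinjection timer. Note that inject starts at 1, so an empty flush that pops no packets never arms the timer.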
@@ -777,6 +935,16 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
return;
}
+ /*
+ * Kill the broken guest timer. The reason we are here is that the guest has
+ * kicked us to send packets; therefore we don't need to go back and consider
+ * injecting it with interrupts, because we will do that again naturally. We
+ * also don't reset the tick count here; virtio_net_rein_enable() zeroes it
+ * each time the timer is armed.
+ */
+ if (n->rein_timer_act)
+ virtio_net_rein_disable(n);
+
+
if (n->tx_waiting) {
virtio_queue_set_notification(vq, 1);
qemu_del_timer(n->tx_timer);
@@ -1024,6 +1192,12 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
}
+ n->rein_timer = qemu_new_timer(vm_clock, virtio_net_rein_tick, n);
+ n->rein_ring_idx = 0;
+ bzero(n->rein_ring, sizeof (rein_event_t) * REIN_RING_MAX);
+ n->rein_n_dead = 0;
+ n->rein_n_inject = 0;
+ n->rein_n_rerun = 0;
n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
qemu_macaddr_default_if_unset(&conf->macaddr);
memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac));
diff --git a/hw/virtio.c b/hw/virtio.c
index 31bd9e3..e9f5cdc 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -17,20 +17,12 @@
#include "qemu-error.h"
#include "virtio.h"
#include "sysemu.h"
+#include "qemu-barrier.h"
/* The alignment to use between consumer and producer parts of vring.
* x86 pagesize again. */
#define VIRTIO_PCI_VRING_ALIGN 4096
-/* QEMU doesn't strictly need write barriers since everything runs in
- * lock-step. We'll leave the calls to wmb() in though to make it obvious for
- * KVM or if kqemu gets SMP support.
- * In any case, we must prevent the compiler from reordering the code.
- * TODO: we likely need some rmb()/mb() as well.
- */
-
-#define wmb() __asm__ __volatile__("": : :"memory")
-
typedef struct VRingDesc
{
uint64_t addr;
@@ -169,6 +161,13 @@ static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
stw_phys(pa, vring_used_idx(vq) + val);
}
+static inline uint16_t vring_used_flags(VirtQueue *vq)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, flags);
+ return lduw_phys(pa);
+}
+
static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
{
target_phys_addr_t pa;
@@ -235,7 +234,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
/* Make sure buffer is written before we update index. */
- wmb();
+ smp_wmb();
trace_virtqueue_flush(vq, count);
vring_used_idx_increment(vq, count);
vq->inuse -= count;
@@ -258,6 +257,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
idx, vring_avail_idx(vq));
exit(1);
}
+ /* On success, callers read a descriptor at vq->last_avail_idx.
+ * Make sure descriptor read does not bypass avail index read. */
+ if (num_heads) {
+ smp_rmb();
+ }
return num_heads;
}
@@ -291,7 +295,7 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
/* Check they're not leading us off end of descriptors. */
next = vring_desc_next(desc_pa, i);
/* Make sure compiler knows to grab that: we don't want it changing! */
- wmb();
+ smp_wmb();
if (next >= max) {
error_report("Desc next is %u", next);
@@ -629,17 +633,20 @@ void virtio_irq(VirtQueue *vq)
virtio_notify_vector(vq->vdev, vq->vector);
}
-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
+ /* We need to expose used array entries before checking used event. */
+ smp_mb();
/* Always notify when queue is empty (when feature acknowledge) */
if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) &&
(!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ||
(vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
- return;
+ return 0;
trace_virtio_notify(vdev, vq);
vdev->isr |= 0x01;
virtio_notify_vector(vdev, vq->vector);
+ return 1;
}
void virtio_notify_config(VirtIODevice *vdev)
@@ -880,3 +887,55 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
{
return &vq->host_notifier;
}
+
+int virtqueue_handled(VirtQueue *vq)
+{
+ smp_mb();
+ return (vq->last_avail_idx == vring_used_idx(vq) ||
+ vq->last_avail_idx != vring_avail_idx(vq));
+}
+
+/*
+ * We need to go through and check if we have hit the 'stalled' condition.
+ * Due to the way that the virtio driver is implemented in the Linux kernel,
+ * it will potentially kick the host to process data, disable the queue, but
+ * not enable interrupts before the host is done processing packets. When this
+ * happens, all network traffic from the guest ends up getting corked up,
+ * because the guest disabled the queue and is waiting for an interrupt from
+ * the host to go and enable it again. In fact, when in this state, a little
+ * bit of libproc magic gets us going again rather reliably.
+ *
+ * Eventually the guest will go through and unmask interrupts, saying that it
+ * wants an injection. If we reach a point in time where the last seen
+ * available index is equal to both the available ring index and the used
+ * ring index, then we'll go ahead and inject the interrupt.
+ */
+int virtqueue_stalled(VirtQueue *vq)
+{
+ smp_mb();
+
+ if (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT)
+ return (0);
+
+ if (vring_used_flags(vq) & VRING_USED_F_NO_NOTIFY)
+ return (0);
+
+ if (vq->inuse)
+ return (0);
+
+ /* We could have also lost the interrupt the other way */
+ if (vq->last_avail_idx != vring_avail_idx(vq))
+ return (2);
+
+ if (vq->last_avail_idx != vring_used_idx(vq))
+ return (0);
+
+ /*
+ * Interrupts are enabled and we're at a point in time where we would
+ * have stalled. Let's go ahead and inject the interrupt.
+ */
+ trace_virtio_notify(vq->vdev, vq);
+ vq->vdev->isr |= 0x01;
+ virtio_notify_vector(vq->vdev, vq->vector);
+ return (1);
+}
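
virtqueue_stalled() works through five checks before deciding to inject. Since all of its inputs are simple flags and indices, the decision tree can be exercised in isolation; below is a small self-contained model (hypothetical name, plain parameters instead of guest-memory reads) with the same return-value meanings:

#include <assert.h>
#include <stdint.h>

/* 0 = do nothing, 1 = inject the interrupt, 2 = rerun the TX handler. */
static int
stalled_model(int avail_no_intr, int used_no_notify, int inuse,
    uint16_t last_avail, uint16_t avail, uint16_t used)
{
	if (avail_no_intr)		/* guest still has interrupts masked */
		return (0);
	if (used_no_notify)		/* TX processing is still in flight */
		return (0);
	if (inuse)			/* buffers are still being consumed */
		return (0);
	if (last_avail != avail)	/* guest queued work we never saw */
		return (2);
	if (last_avail != used)		/* not done publishing used entries */
		return (0);
	return (1);			/* genuinely stalled: inject */
}

int
main(void)
{
	/* Interrupts unmasked and all three indices agree: inject. */
	assert(stalled_model(0, 0, 0, 5, 5, 5) == 1);
	/* The guest bumped avail after our last pass: rerun TX instead. */
	assert(stalled_model(0, 0, 0, 5, 6, 5) == 2);
	/* Interrupts still masked: leave the timer running. */
	assert(stalled_model(1, 0, 0, 5, 5, 5) == 0);
	return (0);
}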
diff --git a/hw/virtio.h b/hw/virtio.h
index d1fb8a0..ed4743c 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -153,7 +153,7 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem);
int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes);
-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
void virtio_save(VirtIODevice *vdev, QEMUFile *f);
@@ -226,4 +226,6 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
void virtio_queue_notify_vq(VirtQueue *vq);
void virtio_irq(VirtQueue *vq);
+int virtqueue_stalled(VirtQueue *vq);
+int virtqueue_handled(VirtQueue *vq);
#endif
diff --git a/qemu-barrier.h b/qemu-barrier.h
index b77fce2..7e11197 100644
--- a/qemu-barrier.h
+++ b/qemu-barrier.h
@@ -1,10 +1,65 @@
#ifndef __QEMU_BARRIER_H
#define __QEMU_BARRIER_H 1
-/* FIXME: arch dependant, x86 version */
-#define smp_wmb() asm volatile("" ::: "memory")
-
/* Compiler barrier */
#define barrier() asm volatile("" ::: "memory")
+#if defined(__i386__)
+
+/*
+ * Because of the strongly ordered x86 storage model, wmb() and rmb() are
+ * nops on x86 (a compiler barrier only), at least as long as qemu doesn't
+ * do accesses to write-combining memory or non-temporal load/stores from C
+ * code.
+ */
+#define smp_wmb() barrier()
+#define smp_rmb() barrier()
+/*
+ * We use the GCC builtin if it's available, as that can use
+ * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
+ * However, on i386, there seem to be known bugs as recently as 4.3.
+ */
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+#define smp_mb() __sync_synchronize()
+#else
+#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+#endif
+
+#elif defined(__x86_64__)
+
+#define smp_wmb() barrier()
+#define smp_rmb() barrier()
+#define smp_mb() asm volatile("mfence" ::: "memory")
+
+#elif defined(_ARCH_PPC)
+
+/*
+ * We use an eieio() for wmb() on powerpc. This assumes we don't
+ * need to order cacheable and non-cacheable stores with respect to
+ * each other
+ */
+#define smp_wmb() asm volatile("eieio" ::: "memory")
+
+#if defined(__powerpc64__)
+#define smp_rmb() asm volatile("lwsync" ::: "memory")
+#else
+#define smp_rmb() asm volatile("sync" ::: "memory")
+#endif
+
+#define smp_mb() asm volatile("sync" ::: "memory")
+
+#else
+
+/*
+ * For (host) platforms we don't have explicit barrier definitions
+ * for, we use the gcc __sync_synchronize() primitive to generate a
+ * full barrier. This should be safe on all platforms, though it may
+ * be overkill for wmb() and rmb().
+ */
+#define smp_wmb() __sync_synchronize()
+#define smp_mb() __sync_synchronize()
+#define smp_rmb() __sync_synchronize()
+
+#endif
+
#endif
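
These macros are used in pairs, and the virtio ring depends on the publish/consume discipline they enable: virtqueue_flush() issues smp_wmb() before bumping the used index, and virtqueue_num_heads() issues smp_rmb() after reading the avail index. A minimal self-contained sketch of that discipline, assuming an x86 host so both barriers reduce to compiler barriers as above:

#include <stdint.h>

#define	barrier()	asm volatile("" ::: "memory")
#define	smp_wmb()	barrier()	/* x86: stores not reordered with stores */
#define	smp_rmb()	barrier()	/* x86: loads not reordered with loads */

static uint64_t payload;		/* the "ring entry" */
static volatile uint16_t published_idx;	/* the "ring index" */

/* Producer: make the data visible before publishing the new index. */
static void
publish(uint64_t val)
{
	payload = val;
	smp_wmb();		/* cf. virtqueue_flush() before the idx bump */
	published_idx = published_idx + 1;
}

/* Consumer: read the index first, then data written before the bump. */
static uint64_t
consume(uint16_t *seen)
{
	*seen = published_idx;
	smp_rmb();		/* cf. virtqueue_num_heads() after the idx read */
	return (payload);
}

int
main(void)
{
	uint16_t idx;

	publish(42);
	return (consume(&idx) == 42 ? 0 : 1);
}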
diff --git a/qemu_mdb.c b/qemu_mdb.c
index 96fcef5..d6daf1f 100644
--- a/qemu_mdb.c
+++ b/qemu_mdb.c
@@ -114,6 +114,69 @@ typedef struct VRing
target_phys_addr_t used;
} VRing;
+/* Sigh. More definitions ... */
+typedef enum rein_act {
+ REIN_INJECT,
+ REIN_DEADMAN,
+ REIN_RUN
+} rein_act_t;
+
+#define REIN_RING_MAX 64
+
+typedef struct rein_event {
+ rein_act_t re_act;
+ hrtime_t re_time;
+ uint64_t re_other;
+ struct timeval re_tval;
+} rein_event_t;
+
+typedef struct VirtIONet
+{
+ VirtIODevice vdev;
+ uint8_t mac[ETH_ALEN];
+ uint16_t status;
+ VirtQueue *rx_vq;
+ VirtQueue *tx_vq;
+ VirtQueue *ctrl_vq;
+ NICState *nic;
+ QEMUTimer *tx_timer;
+ QEMUBH *tx_bh;
+ uint32_t tx_timeout;
+ int32_t tx_burst;
+ int tx_waiting;
+ uint32_t has_vnet_hdr;
+ uint8_t has_ufo;
+ struct {
+ VirtQueueElement elem;
+ ssize_t len;
+ } async_tx;
+ int mergeable_rx_bufs;
+ uint8_t promisc;
+ uint8_t allmulti;
+ uint8_t alluni;
+ uint8_t nomulti;
+ uint8_t nouni;
+ uint8_t nobcast;
+ uint8_t vhost_started;
+ struct {
+ int in_use;
+ int first_multi;
+ uint8_t multi_overflow;
+ uint8_t uni_overflow;
+ uint8_t *macs;
+ } mac_table;
+ uint32_t *vlans;
+ DeviceState *qdev;
+ QEMUTimer *rein_timer;
+ uint32_t rein_timer_ticks;
+ uint8_t rein_timer_act;
+ uint32_t rein_ring_idx;
+ rein_event_t rein_ring[REIN_RING_MAX];
+ uint64_t rein_n_dead;
+ uint64_t rein_n_inject;
+ uint64_t rein_n_rerun;
+} VirtIONet;
+
/*
* NDEVICES comes from the PCIDevice structure and should be changed if this
* does ever change.
@@ -624,6 +687,58 @@ qemu_mdb_vravail(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
+static const char *reintostr[] = {
+ "INJECT",
+ "DEADMAN",
+ "RUN"
+};
+
+static int
+qemu_mdb_nic_reinject(uintptr_t addr, uint_t flags, int argc,
+ const mdb_arg_t *argv)
+{
+ VirtIONet *n;
+ uint32_t ii, end;
+ rein_event_t *rep;
+
+ if (!(flags & DCMD_ADDRSPEC))
+ return (DCMD_USAGE);
+
+ if (argc > 1)
+ return (DCMD_USAGE);
+
+ n = mdb_alloc(sizeof (VirtIONet), UM_SLEEP | UM_GC);
+
+ if (mdb_vread(n, sizeof (VirtIONet), addr) != sizeof (VirtIONet)) {
+ mdb_warn("failed to read VirtIONet");
+ return (DCMD_ERR);
+ }
+
+ mdb_printf("%-?s %-10s %s\n", "TIMESTAMP", "ACTION", "OTHER");
+
+ /*
+ * Walk the ring oldest-first: rein_ring_idx points at the next slot to
+ * be overwritten, which is the oldest entry. Slots that have never been
+ * written are still zeroed and get skipped.
+ */
+ ii = n->rein_ring_idx;
+ for (end = 0; end < REIN_RING_MAX; end++) {
+ rep = n->rein_ring + ii;
+ ii = (ii + 1) % REIN_RING_MAX;
+
+ if (rep->re_time == 0 && rep->re_other == 0)
+ continue;
+
+ mdb_printf("%-?p %-10s ", rep->re_time, reintostr[rep->re_act]);
+ if (rep->re_other == 0)
+ mdb_printf("%s\n", " - ");
+ else
+ mdb_printf("%llu\n", (u_longlong_t)rep->re_other);
+ }
+
+ return (DCMD_OK);
+}
+
static const mdb_dcmd_t qemu_dcmds[] = {
{ "pcidev2virtio", NULL, "translate a virtio PCI device to its "
"virtio equivalent", qemu_mdb_pcidev2virtio },
@@ -633,6 +748,8 @@ static const mdb_dcmd_t qemu_dcmds[] = {
qemu_mdb_vrused },
{ "qemu_vravail", NULL, "Spit out the avail event of the vring",
qemu_mdb_vravail },
+ { "qemu_nic_reinject", NULL, "Print all of the reinject events",
+ qemu_mdb_nic_reinject },
{ NULL }
};
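
For completeness, a hypothetical mdb session using the new dcmd (the addresses and values here are invented for illustration; the column layout comes from the dcmd's own format string): pipe a VirtIONet address to qemu_nic_reinject to dump the recorded reinjection history.

> ffffff01d2ba8000::qemu_nic_reinject
TIMESTAMP        ACTION     OTHER
2386f26fc10      RUN        12
2386f27a3b8      INJECT     37
2386f292c40      DEADMAN    501

OTHER is the tick count recorded with each event (re_other). A DEADMAN entry shows 501 rather than 500 because virtio_net_rein_tick() increments the tick counter before the deadman comparison fires.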