author		Max Bruning <max@joyent.com>	2011-04-19 10:50:19 -0700
committer	Max Bruning <max@joyent.com>	2011-04-19 10:50:19 -0700
commit		f7676d3a7b7ff8081be62e83d187c17dfc3cf15d (patch)
tree		8945691c02b720805be38a0b31ceb7859fd4e4ef
parent		dda119d58bb0d9faf48c70919a1079f8844ed5e2 (diff)
download	illumos-kvm-f7676d3a7b7ff8081be62e83d187c17dfc3cf15d.tar.gz
Initial support for HVM-85
-rw-r--r--	kvm.c		539
-rw-r--r--	kvm.h		 84
-rw-r--r--	kvm_x86.c	110
-rw-r--r--	kvm_x86host.h	 34
4 files changed, 708 insertions, 59 deletions
diff --git a/kvm.c b/kvm.c
index 96dfdc4..fcf43d0 100644
--- a/kvm.c
+++ b/kvm.c
@@ -2716,6 +2716,8 @@ void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask,
shadow_x_mask = x_mask;
}
+uint64_t cpu_tsc_khz;
+extern uint64_t cpu_freq_hz;
static void kvm_timer_init(void)
{
@@ -2732,6 +2734,7 @@ static void kvm_timer_init(void)
#else
/* assume pi_clock in mhz */
/* cpu_tsc_khz = (CPU)->cpu_type_info.pi_clock * 1000;*/
+ cpu_tsc_khz = (cpu_freq_hz / 1000);
#endif /*CONFIG_SOLARIS*/
}
@@ -7467,6 +7470,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
#else
+ vmx->vnmi_blocked_time +=
+ gethrtime() - vmx->entry_time;
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -7562,6 +7567,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
#ifdef XXX
vmx->entry_time = ktime_get();
#else
+ vmx->entry_time = gethrtime();
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -11529,9 +11535,49 @@ static inline void native_set_debugreg(int regno, unsigned long value)
}
}
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ uint32_t quotient, remainder;
+
+ /* Don't try to replace with do_div(), this one calculates
+ * "(dividend << 32) / divisor" */
+ __asm__ ( "divl %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (0), "1" (dividend), "r" (divisor) );
+ return quotient;
+}
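
div_frac() computes "(dividend << 32) / divisor" by loading EDX:EAX with dividend in the high half and zero in the low half before the divl, i.e. it returns the 0.32 fixed-point fraction dividend/divisor. A portable sketch of the same computation, for illustration only (not part of this commit; like the asm version it assumes the quotient fits in 32 bits, i.e. dividend < divisor):

static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
{
	/* e.g. div_frac_portable(1, 2) == 0x80000000, i.e. 0.5 in 0.32 fixed point */
	return ((uint32_t)(((uint64_t)dividend << 32) / divisor));
}
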
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+ uint64_t nsecs = 1000000000LL;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
+ shift++;
+ }
+
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+#ifdef KVM_DEBUG
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __func__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
+#endif /*KVM_DEBUG*/
+}
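
kvm_set_time_scale() chooses tsc_shift and tsc_to_system_mul so that a TSC delta converts to nanoseconds with one 32.32 fixed-point multiply: the first loop halves the ticks-per-second value until it is no more than 2 * 10^9 (decrementing the shift), the second doubles it until it exceeds 10^9 (incrementing the shift), so the multiplier returned by div_frac(10^9, tps32) always lands in [0x80000000, 0xFFFFFFFF]. A guest-side sketch of how the two fields are applied (helper name hypothetical, not part of this commit; __uint128_t assumed available, as with GCC):

static uint64_t pvclock_tsc_to_ns(uint64_t tsc_delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		tsc_delta >>= -shift;
	else
		tsc_delta <<= shift;
	/* ns = delta * (10^9 / ticks-per-second), in 32.32 fixed point */
	return ((uint64_t)(((__uint128_t)tsc_delta * mul) >> 32));
}

For example, with cpu_tsc_khz == 2000000 (a 2 GHz TSC) neither loop runs, so tsc_shift stays 0 and tsc_to_system_mul becomes 0x80000000, scaling each tick to 0.5 ns as expected.
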
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
-#ifdef XXX
struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
@@ -11541,24 +11587,40 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- this_tsc_khz = get_cpu_var(cpu_tsc_khz);
- if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ this_tsc_khz = cpu_tsc_khz;
+ if (vcpu->hv_clock_tsc_khz != this_tsc_khz) {
kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+#ifdef XXX
put_cpu_var(cpu_tsc_khz);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+#ifdef XXX
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
+#else
+ /*
+ * may need to mask interrupts for local_irq_save, and unmask
+ * for local_irq_restore. cli()/sti() might be done...
+ */
+ XXX_KVM_PROBE;
+#endif /*XXX*/
kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
- ktime_get_ts(&ts);
+ gethrestime(&ts);
+#ifdef XXX
monotonic_to_bootbased(&ts);
local_irq_restore(flags);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+ (NSEC_PER_SEC * (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
/*
* The interface expects us to write an even number signaling that the
@@ -11573,10 +11635,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
sizeof(vcpu->hv_clock));
- mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-#else
- XXX_KVM_PROBE;
-#endif /*XXX*/
+ mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT);
}
/*
@@ -11835,6 +11894,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
#else
+ mutex_enter(&cpu_lock);
+ cyclic_remove(apic->lapic_timer.kvm_cyclic_id);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif /*XXX*/
@@ -11865,6 +11927,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
#else
+ apic->lapic_timer.pending = 0;
XXX_KVM_PROBE;
#endif /*XXX*/
if (kvm_vcpu_is_bsp(vcpu))
@@ -13200,6 +13263,451 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
#endif /*XXX*/
}
+static int64_t __kpit_elapsed(struct kvm *kvm)
+{
+ int64_t elapsed;
+ hrtime_t remaining;
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ if (!ps->pit_timer.period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+#ifdef XXX
+ remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
+ elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+#else
+ remaining = 0; /* XXX assumes timer always expires */
+ elapsed = ps->pit_timer.period;
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+ elapsed = mod_64(elapsed, ps->pit_timer.period);
+
+ return elapsed;
+}
+
+static int64_t kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(kvm);
+
+ return gethrtime() - c->count_load_time;
+}
+
+static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+ union {
+ uint64_t ll;
+ struct {
+ uint32_t low, high;
+ } l;
+ } u, res;
+ uint64_t rl, rh;
+
+ u.ll = a;
+ rl = (uint64_t)u.l.low * (uint64_t)b;
+ rh = (uint64_t)u.l.high * (uint64_t)b;
+ rh += (rl >> 32);
+ res.l.high = rh/c;
+ res.l.low = ((mod_64(rh, c) << 32) + (rl & 0xffffffff))/ c;
+ return res.ll;
+}
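
muldiv64() evaluates a * b / c without overflowing 64 bits by splitting a into 32-bit halves (the intermediate product needs up to 96 bits); pit_get_count() and create_pit_timer() use it to convert between nanoseconds and PIT input-clock ticks (KVM_PIT_FREQ). For comparison, on a compiler with a 128-bit integer type the same computation is a one-liner (illustration only, assumes __uint128_t):

static uint64_t muldiv64_wide(uint64_t a, uint32_t b, uint32_t c)
{
	return ((uint64_t)(((__uint128_t)a * b) / c));
}

For example, muldiv64(NSEC_PER_SEC, KVM_PIT_FREQ, NSEC_PER_SEC) == 1193181: one second of elapsed time corresponds to 1,193,181 PIT input-clock ticks.
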
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ int64_t d, t;
+ int counter;
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ int64_t d, t;
+ int out;
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(kvm, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(kvm, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+#ifdef XXX_KVM_DOESNTCOMPILE
+ return container_of(dev, struct kvm_pit, dev);
+#else
+ return (struct kvm_pit *)(((caddr_t)dev) -
+ offsetof(struct kvm_pit, dev));
+#endif /*XXX_KVM_DOESNTCOMPILE*/
+}
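
The offsetof() arithmetic in dev_to_pit(), repeated in kpit_is_periodic() below and in lapic_is_periodic() in kvm_x86.c, is the usual container_of() idiom, open-coded here because the Linux macro does not compile in this port. One way to capture it once, for illustration only (macro name hypothetical, not part of this commit):

/* Recover the enclosing structure from a pointer to one of its members. */
#define kvm_container_of(ptr, type, member) \
	((type *)((caddr_t)(ptr) - offsetof(type, member)))

/*
 * dev_to_pit() would then reduce to:
 *	return (kvm_container_of(dev, struct kvm_pit, dev));
 */
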
+
+static int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_enter(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_exit(&pit_state->lock);
+ return 0;
+}
+
+static void destroy_pit_timer(struct kvm_timer *pt)
+{
+#ifdef XXX
+ pr_debug("pit: " "execute del timer!\n");
+ hrtimer_cancel_p(&pt->timer);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+}
+
+extern void kvm_timer_fn(void *arg);
+
+
+static int kpit_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_kpit_state *ps = (struct kvm_kpit_state *)(((caddr_t)ktimer)
+ - offsetof(struct kvm_kpit_state,
+ pit_timer));
+ return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+ .is_periodic = kpit_is_periodic,
+};
+
+static void create_pit_timer(struct kvm_kpit_state *ps, uint32_t val, int is_period)
+{
+ struct kvm_timer *pt = &ps->pit_timer;
+ int64_t interval;
+
+ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+#ifdef KVM_DEBUG
+ cmn_err(CE_NOTE, "pit: create pit timer, interval is %llu nsec\n", interval);
+#endif
+
+ mutex_enter(&cpu_lock);
+ /* TODO: the new value only takes effect after the counter is retriggered */
+ cyclic_remove(pt->kvm_cyclic_id);
+ pt->period = interval;
+ ps->is_periodic = is_period;
+
+ pt->kvm_cyc_handler.cyh_func = kvm_timer_fn;
+#ifdef XXX
+ hrtimer_data_pointer(&pt->timer);
+#else
+ XXX_KVM_PROBE;
+#endif
+ pt->t_ops = &kpit_ops;
+ pt->kvm = ps->pit->kvm;
+ pt->vcpu = pt->kvm->bsp_vcpu;
+
+ pt->pending = 0; /*XXX need protection?*/
+ ps->irq_ack = 1;
+
+ cyclic_add(&pt->kvm_cyc_handler, &pt->kvm_cyc_when);
+ mutex_exit(&cpu_lock);
+}
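
create_pit_timer() replaces the Linux hrtimer with an illumos cyclic: the handler goes into pt->kvm_cyc_handler, and cyclic_remove()/cyclic_add() must be called with cpu_lock held. Note that this hunk sets only cyh_func and never programs pt->kvm_cyc_when, so the firing time and interval have to be established elsewhere. For reference, a minimal sketch of how a periodic cyclic is normally armed (helper name hypothetical, not part of this commit; types and constants from <sys/cyclic.h>):

static cyclic_id_t
arm_periodic_cyclic(cyc_func_t func, void *arg, hrtime_t interval)
{
	cyc_handler_t hdlr;
	cyc_time_t when;
	cyclic_id_t id;

	hdlr.cyh_func = func;
	hdlr.cyh_arg = arg;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;		/* fire as soon as possible ... */
	when.cyt_interval = interval;	/* ... and every "interval" ns after */

	mutex_enter(&cpu_lock);		/* cyclic_add() requires cpu_lock */
	id = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);

	return (id);
}
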
+
+static void pit_load_count(struct kvm *kvm, int channel, uint32_t val)
+{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ ASSERT(mutex_owned(&ps->lock));
+
+#ifdef KVM_DEBUG
+ cmn_err(CE_NOTE, "pit: load_count val is %d, channel is %d\n", val, channel);
+#endif
+
+ /*
+ * The largest possible initial count is 0; this is equivalent
+ * to 2^16 (65536) for binary counting and 10^4 (10000) for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = gethrtime();
+ return;
+ }
+
+ /* Two types of timer:
+ * mode 1 is one-shot, mode 2 is periodic; otherwise delete the timer. */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+ create_pit_timer(ps, val, 0);
+ }
+ break;
+ case 2:
+ case 3:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+ create_pit_timer(ps, val, 1);
+ }
+ break;
+ default:
+ destroy_pit_timer(&ps->pit_timer);
+ }
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ uint32_t val = *(uint32_t *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_enter(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("pit: " "write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(kvm, channel);
+ if (!(val & 0x10))
+ pit_latch_status(kvm, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(kvm, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(kvm, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(kvm, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_exit(&pit_state->lock);
+ return 0;
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
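
pit_ioport_write() decodes the classic 8254 command byte when addr == 3: bits 7:6 select the channel, bits 5:4 the access mode, bits 3:1 the counting mode, and bit 0 BCD. As a worked example (values assumed, not taken from this commit), the traditional PC programming sequence writes 0x36 to port 0x43 and then a 16-bit reload value to port 0x40:

/*
 * 0x36 = 0b00110110, decoded by the code above as:
 *   val >> 6       == 0  -> channel 0
 *   (val >> 4) & 3 == 3  -> access RW_STATE_WORD0 (LSB then MSB)
 *   (val >> 1) & 7 == 3  -> mode 3 (square wave), so is_period == 1
 *   val & 1        == 0  -> binary counting
 *
 * A reload value of 11932 then makes create_pit_timer() arm a cyclic
 * with interval muldiv64(11932, NSEC_PER_SEC, KVM_PIT_FREQ), roughly
 * 10 ms, i.e. the classic 100 Hz PIT tick.
 */
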
+
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ mutex_enter(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit->kvm, i, 0);
+ }
+ mutex_exit(&pit->pit_state.lock);
+
+ pit->pit_state.pit_timer.pending = 0; /*XXX need protection?*/
+ pit->pit_state.irq_ack = 1;
+}
+
/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, uint32_t flags)
{
@@ -13233,27 +13741,34 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, uint32_t flags)
#ifdef XXX
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+#ifdef XXX
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
#else
XXX_KVM_PROBE;
#endif /*XXX*/
+ pit_state->pit_timer.reinject = 1;
mutex_exit(&pit->pit_state.lock);
-#ifdef XXX
kvm_pit_reset(pit);
-
+#ifdef XXX
pit->mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
if (ret < 0)
goto fail;
+#ifdef XXX
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
diff --git a/kvm.h b/kvm.h
index 2eeed1b..a59eac7 100644
--- a/kvm.h
+++ b/kvm.h
@@ -7,7 +7,10 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
-
+#undef _ASM /* cyclic.h expects this not defined */
+#include <sys/cyclic.h>
+#define _ASM
+#include <sys/atomic.h>
#include "kvm_types.h"
#define XXX_KVM_PROBE DTRACE_PROBE2(kvm__xxx, \
@@ -296,38 +299,15 @@ struct fxsave {
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
#ifdef _KERNEL
#include "kvm_emulate.h"
-/*
- * These structs MUST NOT be changed.
- * They are the ABI between hypervisor and guest OS.
- * Both Xen and KVM are using this.
- *
- * pvclock_vcpu_time_info holds the system time and the tsc timestamp
- * of the last update. So the guest can use the tsc delta to get a
- * more precise system time. There is one per virtual cpu.
- *
- * pvclock_wall_clock references the point in time when the system
- * time was zero (usually boot time), thus the guest calculates the
- * current wall clock by adding the system time.
- *
- * Protocol for the "version" fields is: hypervisor raises it (making
- * it uneven) before it starts updating the fields and raises it again
- * (making it even) when it is done. Thus the guest can make sure the
- * time values it got are consistent by checking the version before
- * and after reading them.
- */
-
-struct pvclock_vcpu_time_info {
- uint32_t version;
- uint32_t pad0;
- uint64_t tsc_timestamp;
- uint64_t system_time;
- uint32_t tsc_to_system_mul;
- char tsc_shift;
- unsigned char pad[3];
-} __attribute__((__packed__)); /* 32 bytes */
#endif /*_KERNEL*/
@@ -357,12 +337,31 @@ struct pvclock_vcpu_time_info {
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
#ifdef _KERNEL
+
+struct kvm_timer {
+#ifdef XXX
+ struct hrtimer timer;
+#else
+ cyclic_id_t kvm_cyclic_id;
+ cyc_handler_t kvm_cyc_handler;
+ cyc_time_t kvm_cyc_when;
+#endif /*XXX*/
+ int64_t period; /* unit: ns */
+ int32_t pending; /* accumulated triggered timers */
+ int reinject;
+ struct kvm_timer_ops *t_ops;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_timer_ops {
+ int (*is_periodic)(struct kvm_timer *);
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
-#ifdef XXX
struct kvm_timer lapic_timer;
-#endif /*XXX*/
uint32_t divide_count;
struct kvm_vcpu *vcpu;
int irr_pending;
@@ -1798,6 +1797,9 @@ struct kvm_tpr_acl_ioc {
/* Available with KVM_CAP_VAPIC */
#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
+#define APIC_BUS_CYCLE_NS 1
+#define NSEC_PER_MSEC 1000000L
+#define NSEC_PER_SEC 1000000000L
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
@@ -1991,17 +1993,13 @@ struct kvm_kpit_channel_state {
uint8_t mode;
uint8_t bcd; /* not supported */
uint8_t gate; /* timer start */
-#ifdef XXX
- ktime_t count_load_time;
-#endif /*XXX*/
+ hrtime_t count_load_time;
};
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
uint32_t flags;
-#ifdef XXX
struct kvm_timer pit_timer;
-#endif /*XXX*/
int is_periodic;
uint32_t speaker_data_on;
kmutex_t lock;
@@ -2023,6 +2021,18 @@ struct kvm_pit {
#endif /*XXX*/
};
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
#define page_to_pfn(page) (page->p_pagenum)
#define set_page_private(page, v) ((page)->p_private = (v))
diff --git a/kvm_x86.c b/kvm_x86.c
index 7fe7436..4e9fc70 100644
--- a/kvm_x86.c
+++ b/kvm_x86.c
@@ -692,6 +692,11 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK /* LVTERR */
};
+static int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+}
+
static void start_apic_timer(struct kvm_lapic *apic)
{
#ifdef XXX
@@ -718,6 +723,26 @@ static void start_apic_timer(struct kvm_lapic *apic)
HRTIMER_MODE_ABS);
#else
+ hrtime_t now = gethrtime();
+
+ apic->lapic_timer.period = (uint64_t)apic_get_reg(apic, APIC_TMICT) *
+ APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period)
+ return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+ apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ }
+ mutex_enter(&cpu_lock);
+ apic->lapic_timer.kvm_cyclic_id = cyclic_add(&apic->lapic_timer.kvm_cyc_handler,
+ &apic->lapic_timer.kvm_cyc_when);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -1238,11 +1263,8 @@ int apic_reg_write(struct kvm_lapic *apic, uint32_t reg, uint32_t val)
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
}
-#ifdef XXX
- atomic_set(&apic->lapic_timer.pending, 0);
-#else
- XXX_KVM_PROBE;
-#endif
+ /* XXX pending needs protection ?*/
+ apic->lapic_timer.pending = 0;
}
break;
}
@@ -1278,6 +1300,9 @@ int apic_reg_write(struct kvm_lapic *apic, uint32_t reg, uint32_t val)
#ifdef XXX
hrtimer_cancel(&apic->lapic_timer.timer);
#else
+ mutex_enter(&cpu_lock);
+ cyclic_remove(apic->lapic_timer.kvm_cyclic_id);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif
apic_set_reg(apic, APIC_TMICT, val);
@@ -1343,6 +1368,70 @@ static const struct kvm_io_device_ops apic_mmio_ops = {
.write = apic_mmio_write,
};
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+ int restart_timer = 0;
+#ifdef XXX
+ wait_queue_head_t *q = &vcpu->wq;
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway.
+ */
+ /* XXX may need protection on pending */
+ if (ktimer->reinject || !ktimer->pending) {
+ atomic_add_32(&ktimer->pending, 1);
+ /* FIXME: this code should not know anything about vcpus */
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ }
+
+#ifdef XXX
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+#ifdef XXX
+ kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+ restart_timer = 1;
+ }
+
+ return restart_timer;
+}
+
+void kvm_timer_fn(void *arg)
+{
+ struct kvm_timer *ktimer = (struct kvm_timer *)arg;
+ int restart_timer;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = ktimer->vcpu;
+ if (!vcpu)
+ return;
+
+ restart_timer = __kvm_timer_fn(vcpu, ktimer);
+}
+
+static int lapic_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_lapic *apic = (struct kvm_lapic *)((caddr_t)ktimer
+ - offsetof(struct kvm_lapic, lapic_timer));
+ return apic_lvtt_period(apic);
+}
+
+static struct kvm_timer_ops lapic_timer_ops = {
+ .is_periodic = lapic_is_periodic,
+};
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -1369,12 +1458,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
apic->lapic_timer.timer.function = kvm_timer_fn;
+#else
+ apic->lapic_timer.kvm_cyc_handler.cyh_func = kvm_timer_fn;
+ apic->lapic_timer.kvm_cyc_handler.cyh_arg = &apic->lapic_timer;
+ apic->lapic_timer.kvm_cyc_handler.cyh_level = CY_HIGH_LEVEL;
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
apic->lapic_timer.t_ops = &lapic_timer_ops;
apic->lapic_timer.kvm = vcpu->kvm;
apic->lapic_timer.vcpu = vcpu;
-#else
- XXX_KVM_PROBE;
-#endif
+
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
diff --git a/kvm_x86host.h b/kvm_x86host.h
index d40f573..a9dd705 100644
--- a/kvm_x86host.h
+++ b/kvm_x86host.h
@@ -357,6 +357,36 @@ struct i387_fxsave_struct {
} __attribute__((aligned(16)));
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time. There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done. Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+ uint32_t version;
+ uint32_t pad0;
+ uint64_t tsc_timestamp;
+ uint64_t system_time;
+ uint32_t tsc_to_system_mul;
+ char tsc_shift;
+ unsigned char pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
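
The version protocol described in the comment above implies a retry loop on the guest side. A minimal sketch of a consistent read (illustration only, not part of this commit; a real guest also needs compiler/memory barriers around the loads):

static uint64_t
pvclock_read_system_time(volatile struct pvclock_vcpu_time_info *ti)
{
	uint32_t version;
	uint64_t system_time;

	do {
		version = ti->version;		/* odd: update in progress */
		system_time = ti->system_time;
	} while ((version & 1) != 0 || version != ti->version);

	return (system_time);
}
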
+
struct kvm_vcpu_arch {
uint64_t host_tsc;
/*
@@ -437,9 +467,9 @@ struct kvm_vcpu_arch {
struct x86_emulate_ctxt emulate_ctxt;
gpa_t time;
-#ifdef XXX
+
struct pvclock_vcpu_time_info hv_clock;
-#endif /*XXX*/
+
unsigned int hv_clock_tsc_khz;
unsigned int time_offset;
page_t *time_page;