author		Max Bruning <max@joyent.com>	2011-04-19 10:50:19 -0700
committer	Max Bruning <max@joyent.com>	2011-04-19 10:50:19 -0700
commit		f7676d3a7b7ff8081be62e83d187c17dfc3cf15d (patch)
tree		8945691c02b720805be38a0b31ceb7859fd4e4ef
parent		dda119d58bb0d9faf48c70919a1079f8844ed5e2 (diff)
download	illumos-kvm-f7676d3a7b7ff8081be62e83d187c17dfc3cf15d.tar.gz
Initial support for HVM-85
-rw-r--r--	kvm.c		539
-rw-r--r--	kvm.h		 84
-rw-r--r--	kvm_x86.c	110
-rw-r--r--	kvm_x86host.h	 34
4 files changed, 708 insertions, 59 deletions
diff --git a/kvm.c b/kvm.c
index 96dfdc4..fcf43d0 100644
--- a/kvm.c
+++ b/kvm.c
@@ -2716,6 +2716,8 @@ void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask,
shadow_x_mask = x_mask;
}
+uint64_t cpu_tsc_khz;
+extern uint64_t cpu_freq_hz;
static void kvm_timer_init(void)
{
@@ -2732,6 +2734,7 @@ static void kvm_timer_init(void)
#else
/* assume pi_clock in mhz */
/* cpu_tsc_khz = (CPU)->cpu_type_info.pi_clock * 1000;*/
+ cpu_tsc_khz = (cpu_freq_hz / 1000);
#endif /*CONFIG_SOLARIS*/
}
@@ -7467,6 +7470,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
#else
+ vmx->vnmi_blocked_time +=
+ gethrtime() - vmx->entry_time;
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -7562,6 +7567,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
#ifdef XXX
vmx->entry_time = ktime_get();
#else
+ vmx->entry_time = gethrtime();
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -11529,9 +11535,49 @@ static inline void native_set_debugreg(int regno, unsigned long value)
}
}
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ uint32_t quotient, remainder;
+
+ /* Don't try to replace with do_div(), this one calculates
+ * "(dividend << 32) / divisor" */
+ __asm__ ( "divl %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (0), "1" (dividend), "r" (divisor) );
+ return quotient;
+}
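
div_frac() computes "(dividend << 32) / divisor" by loading EDX:EAX with dividend in the high half and zero in the low half before the divl, i.e. it returns the 0.32 fixed-point fraction dividend/divisor. A portable sketch of the same computation, for illustration only (not part of this commit; like the asm version it assumes the quotient fits in 32 bits, i.e. dividend < divisor):

static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
{
	/* e.g. div_frac_portable(1, 2) == 0x80000000, i.e. 0.5 in 0.32 fixed point */
	return ((uint32_t)(((uint64_t)dividend << 32) / divisor));
}
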
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+ uint64_t nsecs = 1000000000LL;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
+ shift++;
+ }
+
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+#ifdef KVM_DEBUG
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __func__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
+#endif /*KVM_DEBUG*/
+}
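
kvm_set_time_scale() chooses tsc_shift and tsc_to_system_mul so that a TSC delta converts to nanoseconds with one 32.32 fixed-point multiply: the first loop halves the ticks-per-second value until it is no more than 2 * 10^9 (decrementing the shift), the second doubles it until it exceeds 10^9 (incrementing the shift), so the multiplier returned by div_frac(10^9, tps32) always lands in [0x80000000, 0xFFFFFFFF]. A guest-side sketch of how the two fields are applied (helper name hypothetical, not part of this commit; __uint128_t assumed available, as with GCC):

static uint64_t pvclock_tsc_to_ns(uint64_t tsc_delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		tsc_delta >>= -shift;
	else
		tsc_delta <<= shift;
	/* ns = delta * (10^9 / ticks-per-second), in 32.32 fixed point */
	return ((uint64_t)(((__uint128_t)tsc_delta * mul) >> 32));
}

For example, with cpu_tsc_khz == 2000000 (a 2 GHz TSC) neither loop runs, so tsc_shift stays 0 and tsc_to_system_mul becomes 0x80000000, scaling each tick to 0.5 ns as expected.
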
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
-#ifdef XXX
struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
@@ -11541,24 +11587,40 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- this_tsc_khz = get_cpu_var(cpu_tsc_khz);
- if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ this_tsc_khz = cpu_tsc_khz;
+ if (vcpu->hv_clock_tsc_khz != this_tsc_khz) {
kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+#ifdef XXX
put_cpu_var(cpu_tsc_khz);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+#ifdef XXX
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
+#else
+ /*
+ * may need to mask interrupts for local_irq_save, and unmask
+ * for local_irq_restore. cli()/sti() might be done...
+ */
+ XXX_KVM_PROBE;
+#endif /*XXX*/
kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
- ktime_get_ts(&ts);
+ gethrestime(&ts);
+#ifdef XXX
monotonic_to_bootbased(&ts);
local_irq_restore(flags);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+ (NSEC_PER_SEC * (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
/*
* The interface expects us to write an even number signaling that the
@@ -11573,10 +11635,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
sizeof(vcpu->hv_clock));
- mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-#else
- XXX_KVM_PROBE;
-#endif /*XXX*/
+ mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT);
}
/*
@@ -11835,6 +11894,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
#else
+ mutex_enter(&cpu_lock);
+ cyclic_remove(apic->lapic_timer.kvm_cyclic_id);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif /*XXX*/
@@ -11865,6 +11927,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
#else
+ apic->lapic_timer.pending = 0;
XXX_KVM_PROBE;
#endif /*XXX*/
if (kvm_vcpu_is_bsp(vcpu))
@@ -13200,6 +13263,451 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
#endif /*XXX*/
}
+static int64_t __kpit_elapsed(struct kvm *kvm)
+{
+ int64_t elapsed;
+ hrtime_t remaining;
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ if (!ps->pit_timer.period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+#ifdef XXX
+ remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
+ elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+#else
+ remaining = 0; /* XXX assumes timer always expires */
+ elapsed = ps->pit_timer.period;
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+ elapsed = mod_64(elapsed, ps->pit_timer.period);
+
+ return elapsed;
+}
+
+static int64_t kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(kvm);
+
+ return gethrtime() - c->count_load_time;
+}
+
+static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+ union {
+ uint64_t ll;
+ struct {
+ uint32_t low, high;
+ } l;
+ } u, res;
+ uint64_t rl, rh;
+
+ u.ll = a;
+ rl = (uint64_t)u.l.low * (uint64_t)b;
+ rh = (uint64_t)u.l.high * (uint64_t)b;
+ rh += (rl >> 32);
+ res.l.high = rh/c;
+ res.l.low = ((mod_64(rh, c) << 32) + (rl & 0xffffffff))/ c;
+ return res.ll;
+}
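
muldiv64() evaluates a * b / c without overflowing 64 bits by splitting a into 32-bit halves (the intermediate product needs up to 96 bits); pit_get_count() and create_pit_timer() use it to convert between nanoseconds and PIT input-clock ticks (KVM_PIT_FREQ). For comparison, on a compiler with a 128-bit integer type the same computation is a one-liner (illustration only, assumes __uint128_t):

static uint64_t muldiv64_wide(uint64_t a, uint32_t b, uint32_t c)
{
	return ((uint64_t)(((__uint128_t)a * b) / c));
}

For example, muldiv64(NSEC_PER_SEC, KVM_PIT_FREQ, NSEC_PER_SEC) == 1193181: one second of elapsed time corresponds to 1,193,181 PIT input-clock ticks.
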
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ int64_t d, t;
+ int counter;
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ int64_t d, t;
+ int out;
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ t = kpit_elapsed(kvm, c, channel);
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(kvm, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ ASSERT(mutex_owned(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(kvm, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+#ifdef XXX_KVM_DOESNTCOMPILE
+ return container_of(dev, struct kvm_pit, dev);
+#else
+ return (struct kvm_pit *)(((caddr_t)dev) -
+ offsetof(struct kvm_pit, dev));
+#endif /*XXX_KVM_DOESNTCOMPILE*/
+}
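
The offsetof() arithmetic in dev_to_pit(), repeated in kpit_is_periodic() below and in lapic_is_periodic() in kvm_x86.c, is the usual container_of() idiom, open-coded here because the Linux macro does not compile in this port. One way to capture it once, for illustration only (macro name hypothetical, not part of this commit):

/* Recover the enclosing structure from a pointer to one of its members. */
#define kvm_container_of(ptr, type, member) \
	((type *)((caddr_t)(ptr) - offsetof(type, member)))

/*
 * dev_to_pit() would then reduce to:
 *	return (kvm_container_of(dev, struct kvm_pit, dev));
 */
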
+
+static int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_enter(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_exit(&pit_state->lock);
+ return 0;
+}
+
+static void destroy_pit_timer(struct kvm_timer *pt)
+{
+#ifdef XXX
+ pr_debug("pit: " "execute del timer!\n");
+ hrtimer_cancel_p(&pt->timer);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+}
+
+extern void kvm_timer_fn(void *arg);
+
+
+static int kpit_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_kpit_state *ps = (struct kvm_kpit_state *)(((caddr_t)ktimer)
+ - offsetof(struct kvm_kpit_state,
+ pit_timer));
+ return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+ .is_periodic = kpit_is_periodic,
+};
+
+static void create_pit_timer(struct kvm_kpit_state *ps, uint32_t val, int is_period)
+{
+ struct kvm_timer *pt = &ps->pit_timer;
+ int64_t interval;
+
+ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+#ifdef KVM_DEBUG
+ cmn_err(CE_NOTE, "pit: create pit timer, interval is %llu nsec\n", interval);
+#endif
+
+ mutex_enter(&cpu_lock);
+ /* TODO: the new value only takes effect after the counter is retriggered */
+ cyclic_remove(pt->kvm_cyclic_id);
+ pt->period = interval;
+ ps->is_periodic = is_period;
+
+ pt->kvm_cyc_handler.cyh_func = kvm_timer_fn;
+#ifdef XXX
+ hrtimer_data_pointer(&pt->timer);
+#else
+ XXX_KVM_PROBE;
+#endif
+ pt->t_ops = &kpit_ops;
+ pt->kvm = ps->pit->kvm;
+ pt->vcpu = pt->kvm->bsp_vcpu;
+
+ pt->pending = 0; /*XXX need protection?*/
+ ps->irq_ack = 1;
+
+ cyclic_add(&pt->kvm_cyc_handler, &pt->kvm_cyc_when);
+ mutex_exit(&cpu_lock);
+}
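
create_pit_timer() replaces the Linux hrtimer with an illumos cyclic: the handler goes into pt->kvm_cyc_handler, and cyclic_remove()/cyclic_add() must be called with cpu_lock held. Note that this hunk sets only cyh_func and never programs pt->kvm_cyc_when, so the firing time and interval have to be established elsewhere. For reference, a minimal sketch of how a periodic cyclic is normally armed (helper name hypothetical, not part of this commit; types and constants from <sys/cyclic.h>):

static cyclic_id_t
arm_periodic_cyclic(cyc_func_t func, void *arg, hrtime_t interval)
{
	cyc_handler_t hdlr;
	cyc_time_t when;
	cyclic_id_t id;

	hdlr.cyh_func = func;
	hdlr.cyh_arg = arg;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;		/* fire as soon as possible ... */
	when.cyt_interval = interval;	/* ... and every "interval" ns after */

	mutex_enter(&cpu_lock);		/* cyclic_add() requires cpu_lock */
	id = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);

	return (id);
}
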
+
+static void pit_load_count(struct kvm *kvm, int channel, uint32_t val)
+{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ ASSERT(mutex_owned(&ps->lock));
+
+#ifdef KVM_DEBUG
+ cmn_err(CE_NOTE, "pit: load_count val is %d, channel is %d\n", val, channel);
+#endif
+
+ /*
+ * The largest possible initial count is 0; this is equivalent
+ * to 2^16 (65536) for binary counting and 10^4 (10000) for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = gethrtime();
+ return;
+ }
+
+ /* Two types of timer:
+ * mode 1 is one-shot, mode 2 is periodic; otherwise delete the timer. */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+ create_pit_timer(ps, val, 0);
+ }
+ break;
+ case 2:
+ case 3:
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+ create_pit_timer(ps, val, 1);
+ }
+ break;
+ default:
+ destroy_pit_timer(&ps->pit_timer);
+ }
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ uint32_t val = *(uint32_t *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_enter(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("pit: " "write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(kvm, channel);
+ if (!(val & 0x10))
+ pit_latch_status(kvm, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(kvm, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(kvm, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(kvm, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_exit(&pit_state->lock);
+ return 0;
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
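
pit_ioport_write() decodes the classic 8254 command byte when addr == 3: bits 7:6 select the channel, bits 5:4 the access mode, bits 3:1 the counting mode, and bit 0 BCD. As a worked example (values assumed, not taken from this commit), the traditional PC programming sequence writes 0x36 to port 0x43 and then a 16-bit reload value to port 0x40:

/*
 * 0x36 = 0b00110110, decoded by the code above as:
 *   val >> 6       == 0  -> channel 0
 *   (val >> 4) & 3 == 3  -> access RW_STATE_WORD0 (LSB then MSB)
 *   (val >> 1) & 7 == 3  -> mode 3 (square wave), so is_period == 1
 *   val & 1        == 0  -> binary counting
 *
 * A reload value of 11932 then makes create_pit_timer() arm a cyclic
 * with interval muldiv64(11932, NSEC_PER_SEC, KVM_PIT_FREQ), roughly
 * 10 ms, i.e. the classic 100 Hz PIT tick.
 */
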
+
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ mutex_enter(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit->kvm, i, 0);
+ }
+ mutex_exit(&pit->pit_state.lock);
+
+ pit->pit_state.pit_timer.pending = 0; /*XXX need protection?*/
+ pit->pit_state.irq_ack = 1;
+}
+
/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, uint32_t flags)
{
@@ -13233,27 +13741,34 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, uint32_t flags)
#ifdef XXX
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+#ifdef XXX
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
#else
XXX_KVM_PROBE;
#endif /*XXX*/
+ pit_state->pit_timer.reinject = 1;
mutex_exit(&pit->pit_state.lock);
-#ifdef XXX
kvm_pit_reset(pit);
-
+#ifdef XXX
pit->mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
if (ret < 0)
goto fail;
+#ifdef XXX
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
diff --git a/kvm.h b/kvm.h
index 2eeed1b..a59eac7 100644
--- a/kvm.h
+++ b/kvm.h
@@ -7,7 +7,10 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
-
+#undef _ASM /* cyclic.h expects this not defined */
+#include <sys/cyclic.h>
+#define _ASM
+#include <sys/atomic.h>
#include "kvm_types.h"
#define XXX_KVM_PROBE DTRACE_PROBE2(kvm__xxx, \
@@ -296,38 +299,15 @@ struct fxsave {
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
#ifdef _KERNEL
#include "kvm_emulate.h"
-/*
- * These structs MUST NOT be changed.
- * They are the ABI between hypervisor and guest OS.
- * Both Xen and KVM are using this.
- *
- * pvclock_vcpu_time_info holds the system time and the tsc timestamp
- * of the last update. So the guest can use the tsc delta to get a
- * more precise system time. There is one per virtual cpu.
- *
- * pvclock_wall_clock references the point in time when the system
- * time was zero (usually boot time), thus the guest calculates the
- * current wall clock by adding the system time.
- *
- * Protocol for the "version" fields is: hypervisor raises it (making
- * it uneven) before it starts updating the fields and raises it again
- * (making it even) when it is done. Thus the guest can make sure the
- * time values it got are consistent by checking the version before
- * and after reading them.
- */
-
-struct pvclock_vcpu_time_info {
- uint32_t version;
- uint32_t pad0;
- uint64_t tsc_timestamp;
- uint64_t system_time;
- uint32_t tsc_to_system_mul;
- char tsc_shift;
- unsigned char pad[3];
-} __attribute__((__packed__)); /* 32 bytes */
#endif /*_KERNEL*/
@@ -357,12 +337,31 @@ struct pvclock_vcpu_time_info {
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
#ifdef _KERNEL
+
+struct kvm_timer {
+#ifdef XXX
+ struct hrtimer timer;
+#else
+ cyclic_id_t kvm_cyclic_id;
+ cyc_handler_t kvm_cyc_handler;
+ cyc_time_t kvm_cyc_when;
+#endif /*XXX*/
+ int64_t period; /* unit: ns */
+ int32_t pending; /* accumulated triggered timers */
+ int reinject;
+ struct kvm_timer_ops *t_ops;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_timer_ops {
+ int (*is_periodic)(struct kvm_timer *);
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
-#ifdef XXX
struct kvm_timer lapic_timer;
-#endif /*XXX*/
uint32_t divide_count;
struct kvm_vcpu *vcpu;
int irr_pending;
@@ -1798,6 +1797,9 @@ struct kvm_tpr_acl_ioc {
/* Available with KVM_CAP_VAPIC */
#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
+#define APIC_BUS_CYCLE_NS 1
+#define NSEC_PER_MSEC 1000000L
+#define NSEC_PER_SEC 1000000000L
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
@@ -1991,17 +1993,13 @@ struct kvm_kpit_channel_state {
uint8_t mode;
uint8_t bcd; /* not supported */
uint8_t gate; /* timer start */
-#ifdef XXX
- ktime_t count_load_time;
-#endif /*XXX*/
+ hrtime_t count_load_time;
};
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
uint32_t flags;
-#ifdef XXX
struct kvm_timer pit_timer;
-#endif /*XXX*/
int is_periodic;
uint32_t speaker_data_on;
kmutex_t lock;
@@ -2023,6 +2021,18 @@ struct kvm_pit {
#endif /*XXX*/
};
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
#define page_to_pfn(page) (page->p_pagenum)
#define set_page_private(page, v) ((page)->p_private = (v))
diff --git a/kvm_x86.c b/kvm_x86.c
index 7fe7436..4e9fc70 100644
--- a/kvm_x86.c
+++ b/kvm_x86.c
@@ -692,6 +692,11 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK /* LVTERR */
};
+static int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+}
+
static void start_apic_timer(struct kvm_lapic *apic)
{
#ifdef XXX
@@ -718,6 +723,26 @@ static void start_apic_timer(struct kvm_lapic *apic)
HRTIMER_MODE_ABS);
#else
+ hrtime_t now = gethrtime();
+
+ apic->lapic_timer.period = (uint64_t)apic_get_reg(apic, APIC_TMICT) *
+ APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period)
+ return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+ apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ }
+ mutex_enter(&cpu_lock);
+ apic->lapic_timer.kvm_cyclic_id = cyclic_add(&apic->lapic_timer.kvm_cyc_handler,
+ &apic->lapic_timer.kvm_cyc_when);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif /*XXX*/
}
@@ -1238,11 +1263,8 @@ int apic_reg_write(struct kvm_lapic *apic, uint32_t reg, uint32_t val)
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
}
-#ifdef XXX
- atomic_set(&apic->lapic_timer.pending, 0);
-#else
- XXX_KVM_PROBE;
-#endif
+ /* XXX pending needs protection ?*/
+ apic->lapic_timer.pending = 0;
}
break;
}
@@ -1278,6 +1300,9 @@ int apic_reg_write(struct kvm_lapic *apic, uint32_t reg, uint32_t val)
#ifdef XXX
hrtimer_cancel(&apic->lapic_timer.timer);
#else
+ mutex_enter(&cpu_lock);
+ cyclic_remove(apic->lapic_timer.kvm_cyclic_id);
+ mutex_exit(&cpu_lock);
XXX_KVM_PROBE;
#endif
apic_set_reg(apic, APIC_TMICT, val);
@@ -1343,6 +1368,70 @@ static const struct kvm_io_device_ops apic_mmio_ops = {
.write = apic_mmio_write,
};
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+ int restart_timer = 0;
+#ifdef XXX
+ wait_queue_head_t *q = &vcpu->wq;
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway.
+ */
+ /* XXX may need protection on pending */
+ if (ktimer->reinject || !ktimer->pending) {
+ atomic_add_32(&ktimer->pending, 1);
+ /* FIXME: this code should not know anything about vcpus */
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ }
+
+#ifdef XXX
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+#ifdef XXX
+ kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+#else
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+ restart_timer = 1;
+ }
+
+ return restart_timer;
+}
+
+void kvm_timer_fn(void *arg)
+{
+ struct kvm_timer *ktimer = (struct kvm_timer *)arg;
+ int restart_timer;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = ktimer->vcpu;
+ if (!vcpu)
+ return;
+
+ restart_timer = __kvm_timer_fn(vcpu, ktimer);
+}
+
+static int lapic_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_lapic *apic = (struct kvm_lapic *)((caddr_t)ktimer
+ - offsetof(struct kvm_lapic, lapic_timer));
+ return apic_lvtt_period(apic);
+}
+
+static struct kvm_timer_ops lapic_timer_ops = {
+ .is_periodic = lapic_is_periodic,
+};
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -1369,12 +1458,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
apic->lapic_timer.timer.function = kvm_timer_fn;
+#else
+ apic->lapic_timer.kvm_cyc_handler.cyh_func = kvm_timer_fn;
+ apic->lapic_timer.kvm_cyc_handler.cyh_arg = &apic->lapic_timer;
+ apic->lapic_timer.kvm_cyc_handler.cyh_level = CY_HIGH_LEVEL;
+ XXX_KVM_PROBE;
+#endif /*XXX*/
+
apic->lapic_timer.t_ops = &lapic_timer_ops;
apic->lapic_timer.kvm = vcpu->kvm;
apic->lapic_timer.vcpu = vcpu;
-#else
- XXX_KVM_PROBE;
-#endif
+
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
diff --git a/kvm_x86host.h b/kvm_x86host.h
index d40f573..a9dd705 100644
--- a/kvm_x86host.h
+++ b/kvm_x86host.h
@@ -357,6 +357,36 @@ struct i387_fxsave_struct {
} __attribute__((aligned(16)));
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time. There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done. Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+ uint32_t version;
+ uint32_t pad0;
+ uint64_t tsc_timestamp;
+ uint64_t system_time;
+ uint32_t tsc_to_system_mul;
+ char tsc_shift;
+ unsigned char pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
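
The version protocol described in the comment above implies a retry loop on the guest side. A minimal sketch of a consistent read (illustration only, not part of this commit; a real guest also needs compiler/memory barriers around the loads):

static uint64_t
pvclock_read_system_time(volatile struct pvclock_vcpu_time_info *ti)
{
	uint32_t version;
	uint64_t system_time;

	do {
		version = ti->version;		/* odd: update in progress */
		system_time = ti->system_time;
	} while ((version & 1) != 0 || version != ti->version);

	return (system_time);
}
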
+
struct kvm_vcpu_arch {
uint64_t host_tsc;
/*
@@ -437,9 +467,9 @@ struct kvm_vcpu_arch {
struct x86_emulate_ctxt emulate_ctxt;
gpa_t time;
-#ifdef XXX
+
struct pvclock_vcpu_time_info hv_clock;
-#endif /*XXX*/
+
unsigned int hv_clock_tsc_khz;
unsigned int time_offset;
page_t *time_page;