field | value | date
---|---|---
author | Robert Mustacchi <rm@joyent.com> | 2011-06-07 18:54:02 -0700
committer | Robert Mustacchi <rm@joyent.com> | 2011-06-07 18:54:02 -0700
commit | b21ef367c9f8dfe7164769b2eea9efbb96628be1 (patch) | 
tree | b6c6baee753b9853e33ed3880601648245f655c1 /kvm_x86.c | 
parent | eb471cb1421d3beb5c77a2d2724df28b802de07d (diff) | 
download | illumos-kvm-b21ef367c9f8dfe7164769b2eea9efbb96628be1.tar.gz | 
HVM-321 kvm.c and kvm_x86.c are confused
Diffstat (limited to 'kvm_x86.c')
-rw-r--r-- | kvm_x86.c | 5419
1 file changed, 4680 insertions, 739 deletions
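For readers skimming the patch below, the densest arithmetic in the change is the pvclock scaling done by kvm_set_time_scale() and div_frac(). The following is a minimal standalone sketch of that math, not part of the patch: it swaps the `divl` inline assembly for plain 64-bit arithmetic and assumes a hypothetical 2 GHz TSC purely for the example output; the helper names mirror the patch but the program is illustrative only.

```c
/*
 * Illustrative sketch of the pvclock (shift, mul) computation that
 * kvm_set_time_scale()/div_frac() perform in the patch below.
 * Assumptions: plain 64-bit arithmetic instead of the "divl" asm,
 * and a 2 GHz TSC chosen only for the example.
 */
#include <stdio.h>
#include <stdint.h>

/* (dividend << 32) / divisor -- the same quotient div_frac() gets from divl */
static uint32_t
div_frac(uint32_t dividend, uint32_t divisor)
{
	return ((uint32_t)(((uint64_t)dividend << 32) / divisor));
}

/*
 * Derive the scale the guest uses to turn TSC deltas into nanoseconds:
 * ns = ((delta << shift) * mul) >> 32, shifting right instead when shift < 0.
 */
static void
set_time_scale(uint32_t tsc_khz, int *shiftp, uint32_t *mulp)
{
	uint64_t nsecs = 1000000000ULL;
	int shift = 0;
	uint64_t tps64 = (uint64_t)tsc_khz * 1000ULL;
	uint32_t tps32;

	while (tps64 > nsecs * 2) {	/* bring ticks/sec within 2x of 1e9 */
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {	/* ...and strictly above 1e9 */
		tps32 <<= 1;
		shift++;
	}

	*shiftp = shift;
	*mulp = div_frac((uint32_t)nsecs, tps32);
}

int
main(void)
{
	int shift;
	uint32_t mul;

	set_time_scale(2000000, &shift, &mul);	/* 2,000,000 kHz = 2 GHz */
	(void) printf("tsc_shift=%d tsc_to_system_mul=%u\n", shift, mul);
	return (0);
}
```

With a 2 GHz TSC this prints `tsc_shift=0 tsc_to_system_mul=2147483648`, i.e. each TSC tick corresponds to half a nanosecond, which is the fixed-point form the guest reads back out of `pvclock_vcpu_time_info`.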
@@ -1,38 +1,179 @@ +/* + * + */ + #include <sys/types.h> #include <sys/param.h> -#include <sys/errno.h> -#include <sys/modctl.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> +#include <sys/mutex.h> +#include <sys/ksynch.h> +#include <sys/condvar_impl.h> #include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cpuvar.h> -#include <vm/hat_i86.h> -#include <sys/segments.h> -#include <sys/mman.h> -#include <sys/mach_mmu.h> -#include <sys/int_limits.h> -#include <sys/x_call.h> + +#include <vm/page.h> +#include <vm/hat.h> + +#include <asm/cpu.h> #include "kvm_bitops.h" -#include "kvm_apicdef.h" -#include "kvm_types.h" +#include "kvm_vmx.h" +#include "msr-index.h" +#include "msr.h" +#include "irqflags.h" #include "kvm_host.h" - -#include "kvm_coalesced_mmio.h" +#include "kvm_lapic.h" +#include "processor-flags.h" +#include "kvm_cpuid.h" +#include "hyperv.h" +#include "kvm_apicdef.h" +#include "kvm_iodev.h" +#include "kvm.h" +#include "kvm_x86impl.h" #include "kvm_irq.h" +#include "kvm_tss.h" +#include "kvm_ioapic.h" +#include "kvm_coalesced_mmio.h" #include "kvm_i8254.h" -#include "kvm_x86impl.h" +#include "kvm_mmu.h" +#include "kvm_cache_regs.h" -#undef DEBUG - -extern struct kvm_shared_msrs_global shared_msrs_global; -extern void shared_msr_update(unsigned slot, uint32_t msr); +/* XXX These don't belong here! */ extern caddr_t smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); +#ifdef XXX_KVM_DECLARATION +unsigned long *vmx_io_bitmap_a; +unsigned long *vmx_io_bitmap_b; +unsigned long *vmx_msr_bitmap_legacy; +unsigned long *vmx_msr_bitmap_longmode; +#else +/* make these arrays to try to force into low 4GB memory... */ +/* also need to be aligned... */ +__attribute__((__aligned__(PAGESIZE)))unsigned long + vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long + vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long + vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long + vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)]; +#endif + +static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; + +#define MAX_IO_MSRS 256 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +/* + * EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL; +#endif + +static void update_cr8_intercept(struct kvm_vcpu *); + +struct kvm_x86_ops *kvm_x86_ops; +int ignore_msrs = 0; + +static struct kvm_shared_msrs_global shared_msrs_global; + +static void +kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn) +{ + unsigned slot; + struct kvm_shared_msrs *locals = + (struct kvm_shared_msrs *)(((caddr_t)urn) - + offsetof(struct kvm_shared_msrs, urn)); + struct kvm_shared_msr_values 
*values; + + for (slot = 0; slot < shared_msrs_global.nr; ++slot) { + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; + } + } + locals->registered = 0; + kvm_user_return_notifier_unregister(vcpu, urn); +} + +static void +shared_msr_update(unsigned slot, uint32_t msr) +{ + struct kvm_shared_msrs *smsr; + uint64_t value; + smsr = shared_msrs[CPU->cpu_id]; + + /* + * only read, and nobody should modify it at this time, + * so don't need lock + */ + if (slot >= shared_msrs_global.nr) { + cmn_err(CE_WARN, "kvm: invalid MSR slot!"); + return; + } + + rdmsrl_safe(msr, (unsigned long long *)&value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void +kvm_define_shared_msr(unsigned slot, uint32_t msr) +{ + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot] = msr; +#ifdef XXX + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); +#else + XXX_KVM_SYNC_PROBE; +#endif +} + +static void +kvm_shared_msr_cpu_online(void) +{ + unsigned i; + + for (i = 0; i < shared_msrs_global.nr; i++) + shared_msr_update(i, shared_msrs_global.msrs[i]); +} + +void +kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value, + uint64_t mask) +{ + struct kvm_shared_msrs *smsr = shared_msrs[CPU->cpu_id]; + + if (((value ^ smsr->values[slot].curr) & mask) == 0) + return; + + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); + + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + kvm_user_return_notifier_register(vcpu, &smsr->urn); + smsr->registered = 1; + } +} + unsigned long segment_base(uint16_t selector) { @@ -64,253 +205,1353 @@ segment_base(uint16_t selector) return (v); } +uint64_t +kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return (vcpu->arch.apic_base); + else + return (vcpu->arch.apic_base); +} -struct kvm * -kvm_arch_create_vm(void) +void +kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) { - struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP); + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} - if (!kvm) - return (NULL); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 - if ((kvm->arch.aliases = - kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) { - kmem_free(kvm, sizeof (struct kvm)); - return (NULL); +static int +exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return (EXCPT_PF); + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return (EXCPT_CONTRIBUTORY); + default: + break; } - list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page), - offsetof(struct kvm_mmu_page, link)); + return (EXCPT_BENIGN); +} - list_create(&kvm->arch.assigned_dev_head, - sizeof (struct kvm_assigned_dev_kernel), - offsetof(struct kvm_assigned_dev_kernel, list)); +static void +kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, int has_error, uint32_t error_code) +{ + uint32_t prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { +queue: + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } - /* Reserve bit 0 of irq_sources_bitmap for userspace irq 
source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = 1; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else { + /* + * replace previous exception with a new one in a hope + * that instruction re-execution will regenerate lost + * exception + */ + goto queue; + } +} - /* XXX - original is rdtscll() */ - kvm->arch.vm_init_tsc = (uint64_t)gethrtime(); +void +kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, 0, 0); +} - return (kvm); +void +kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + uint32_t error_code) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest); + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } -inline gpa_t -gfn_to_gpa(gfn_t gfn) +void +kvm_inject_nmi(struct kvm_vcpu *vcpu) { - return ((gpa_t)gfn << PAGESHIFT); + vcpu->arch.nmi_pending = 1; } -page_t *pfn_to_page(pfn_t pfn); +void +kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code) +{ + kvm_multiple_exception(vcpu, nr, 1, error_code); +} + +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +int +kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) +{ + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return (1); + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return (0); +} + +/* + * Load the pae pdptrs. Return true is they are all valid. 
+ */ +int +load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGESHIFT; + unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2; + int i; + int ret; + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, + pdpte, offset * sizeof (uint64_t), sizeof (pdpte)); + + if (ret < 0) { + ret = 0; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(pdpte); i++) { + if (is_present_gpte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty); +out: + return (ret); +} + +static int +pdptrs_changed(struct kvm_vcpu *vcpu) +{ + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return (0); + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) { + return (1); + } + + if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, + pdpte, sizeof (pdpte)) < 0) + return (1); + + return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0); +} + +void +kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { + kvm_inject_gp(vcpu, 0); + return; + } +#endif + + cr0 &= ~CR0_RESERVED_BITS; + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 + if ((vcpu->arch.efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + kvm_inject_gp(vcpu, 0); + return; + + } + } else +#endif + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + kvm_mmu_reset_context(vcpu); +} + +void +kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); +} + +void +kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { + kvm_inject_gp(vcpu, 0); + return; + } + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && + ((cr4 ^ old_cr4) & pdptr_bits) && + !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (cr4 & X86_CR4_VMXE) { + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} + +void +kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + return; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + } else { + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + kvm_inject_gp(vcpu, 0); 
+ return; + } + } + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))) + kvm_inject_gp(vcpu, 0); + else { + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + } +} + +void +kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->arch.cr8 = cr8; +} + +unsigned long +kvm_get_cr8(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) { + return (kvm_lapic_get_cr8(vcpu)); + } else { + return (vcpu->arch.cr8); + } +} + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. + */ + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_SAVE_MSRS_BEGIN 5 +static uint32_t msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA +}; + +static unsigned num_msrs_to_save; + +static uint32_t emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +static void +set_efer(struct kvm_vcpu *vcpu, uint64_t efer) +{ + if (efer & efer_reserved_bits) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + vcpu->arch.efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} void -kvm_release_pfn_clean(pfn_t pfn) +kvm_enable_efer_bits(uint64_t mask) +{ + efer_reserved_bits &= ~mask; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int +kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + return (kvm_x86_ops->set_msr(vcpu, msr_index, data)); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int +do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data) +{ + return (kvm_set_msr(vcpu, index, *data)); +} + +static void +kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { + static int version; + struct pvclock_wall_clock wc; + struct timespec boot; + + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - put_page(pfn_to_page(pfn)); + getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); #else XXX_KVM_PROBE; #endif } -#ifdef IOMMU +static uint32_t +div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; -paddr_t -iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova) + /* + * Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" + */ + __asm__("divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor)); + + return (quotient); +} + +static void +kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) { - return (iommu_ops->iova_to_phys(domain, iova)); + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); } +/* XXX Expected to be per cpu */ +static uint64_t cpu_tsc_khz; +/* XXX extern?! */ +extern uint64_t cpu_freq_hz; -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) +static void +kvm_write_guest_time(struct kvm_vcpu *v) { - gfn_t gfn = base_gfn; - pfn_t pfn; - struct iommu_domain *domain = kvm->arch.iommu_domain; - unsigned long i; - uint64_t phys; + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + unsigned long this_tsc_khz; - /* check if iommu exists and in use */ - if (!domain) + if ((!vcpu->time_page)) return; - for (i = 0; i < npages; i++) { - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - pfn = phys >> PAGESHIFT; - kvm_release_pfn_clean(pfn); - gfn++; + this_tsc_khz = cpu_tsc_khz; + if (vcpu->hv_clock_tsc_khz != this_tsc_khz) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } - iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages); +#ifdef XXX + put_cpu_var(cpu_tsc_khz); +#else + XXX_KVM_PROBE; +#endif + +#ifdef XXX + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); +#else + /* + * may need to mask interrupts for local_irq_save, and unmask + * for local_irq_restore. cli()/sti() might be done... 
+ */ + XXX_KVM_PROBE; +#endif + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + gethrestime(&ts); +#ifdef XXX + monotonic_to_bootbased(&ts); + local_irq_restore(flags); +#else + XXX_KVM_PROBE; +#endif + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * + (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just increase by 2 at the end. + */ + vcpu->hv_clock.version += 2; + + shared_kaddr = page_address(vcpu->time_page); + + memcpy((void *)((uintptr_t)shared_kaddr + vcpu->time_offset), + &vcpu->hv_clock, sizeof (vcpu->hv_clock)); + + mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT); } static int -kvm_iommu_unmap_memslots(struct kvm *kvm) +kvm_request_guest_time_update(struct kvm_vcpu *v) { - int i; - struct kvm_memslots *slots; + struct kvm_vcpu_arch *vcpu = &v->arch; - slots = kvm->memslots; + if (!vcpu->time_page) + return (0); + + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + + return (1); +} - for (i = 0; i < slots->nmemslots; i++) { - kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, - slots->memslots[i].npages); +static int +msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return (1); + case 0x2f8: + return (1); } return (0); } -int -kvm_iommu_unmap_guest(struct kvm *kvm) +static int +valid_pat_type(unsigned t) { - struct iommu_domain *domain = kvm->arch.iommu_domain; + return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */ +} + +static int +valid_mtrr_type(unsigned t) +{ + return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */ +} - /* check if iommu exists and in use */ - if (!domain) +static int +mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + int i; + + if (!msr_mtrr_valid(msr)) return (0); - kvm_iommu_unmap_memslots(kvm); - iommu_domain_free(domain); + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return (0); + return (valid_mtrr_type(data & 0xff)); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } + + /* variable MTRRs */ + return (valid_mtrr_type(data & 0xff)); +} + +static int +set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return (1); + + if (msr == MSR_MTRRdefType) { + state->def_type = data; + state->enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat 
= data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pt = data; + } + + kvm_mmu_reset_context(vcpu); + return (0); } -#endif /* IOMMU */ -static void -kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +static int +set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) { - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + if (data != 0 && data != ~(uint64_t)0) + return (-1); + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + /* + * only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(uint64_t)0) + return (-1); + vcpu->arch.mce_banks[offset] = data; + break; + } + return (1); + } + return (0); } -static void -kvm_free_vcpus(struct kvm *kvmp) +static int +xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) { - int ii, maxcpus; + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + uint8_t *blob_addr = lm ? + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 : + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + uint8_t blob_size = lm ? + kvm->arch.xen_hvm_config.blob_size_64 : + kvm->arch.xen_hvm_config.blob_size_32; + uint32_t page_num = data & ~PAGEMASK; + uint64_t page_addr = data & PAGEMASK; + uint8_t *page; + int r; - maxcpus = kvmp->online_vcpus; - XXX_KVM_SYNC_PROBE; - for (ii = 0; ii < maxcpus; ii++) - kvm_unload_vcpu_mmu(kvmp->vcpus[ii]); + r = E2BIG; + if (page_num >= blob_size) + goto out; + r = ENOMEM; + page = kmem_alloc(PAGESIZE, KM_SLEEP); + r = EFAULT; + if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) + goto out_free; + r = 0; +out_free: + kmem_free(page, PAGESIZE); +out: + return (r); +} - for (ii = 0; ii < maxcpus; ii++) - kvm_arch_vcpu_free(kvmp->vcpus[ii]); - mutex_enter(&kvmp->lock); - for (ii = 0; ii < maxcpus; ii++) - kvmp->vcpus[ii] = NULL; - kvmp->online_vcpus = 0; - mutex_exit(&kvmp->lock); +static int +kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE); } -/* - * This function exists because of a difference in methodologies from our - * ancestor. With our ancestors, there is no imputus to clean up lists and - * mutexes. This is unfortunate, because they seem to even have debug kernels - * which would seemingly check for these kinds of things. But because in the - * common case mutex_exit is currently a #define to do {} while(0), it seems - * that they just ignore this. - * - * This leads to the following behavior: during our time we create a lot of - * auxillary structs potentially related to pits, apics, etc. Tearing down these - * structures relies on having the correct locks, etc. However - * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. 
it's doing - * the kmem_free. Logically these auxillary structures need to be freed and - * dealt with before we go back and do the rest of the tear down related to the - * device. - */ -void -kvm_arch_destroy_vm_comps(struct kvm *kvmp) +static int +kvm_hv_msr_partition_wide(uint32_t msr) { - if (kvmp == NULL) + int r = 0; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = 1; + break; + } -#ifdef IOMMU - kvm_iommu_unmap_guest(kvmp); + return (r); +} + +static int +set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + uint64_t gfn; + unsigned long addr; + uint8_t instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (1); + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copyout(instructions, (caddr_t)addr, 4)) + return (1); + kvm->arch.hv_hypercall = data; + break; + } + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +static int +clear_user(void *addr, unsigned long size) +{ + caddr_t ka; + int rval = 0; + + ka = kmem_zalloc(size, KM_SLEEP); + rval = copyout(ka, addr, size); + kmem_free(ka, size); + + return (rval); +} + +static int +set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + + addr = gfn_to_hva(vcpu->kvm, + data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + + if (kvm_is_error_hva(addr)) + return (1); + + if (clear_user((void *)addr, PAGESIZE)) + return (1); + + vcpu->arch.hv_vapic = data; + break; + } + + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data)); + + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +int +kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_K7_HWCR: + data &= ~(uint64_t)0x40; /* ignore flush filter disable */ + if (data != 0) { + cmn_err(CE_NOTE, + "unimplemented HWCR wrmsr: 0x%lx\n", data); + return (1); + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%lx\n", data); + return (1); + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* + * Values other than LBR and BTF are vendor-specific, + * thus reserved and should throw a #GP + */ + return (1); + } + 
cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return (set_msr_mtrr(vcpu, msr, data)); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_write(vcpu, msr, data)); + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { +#ifdef XXX + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } #else - XXX_KVM_PROBE; -#endif /* IOMMU */ - kvm_free_pit(kvmp); - kvm_free_vcpus(kvmp); - kvm_free_physmem(kvmp); + XXX_KVM_PROBE; +#endif + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); #ifdef XXX -#ifdef APIC - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); -#endif /* APIC */ + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGESHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_request_guest_time_update(vcpu); #else - XXX_KVM_PROBE; -#endif /* XXX */ -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - cleanup_srcu_struct(&kvm->srcu); -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + XXX_KVM_PROBE; +#endif + break; + } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (set_msr_mce(vcpu, msr, data)); + + /* + * Performance counters are not protected by a CPUID bit, so we should + * check all of them in the generic path for the sake of cross vendor + * migration. Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + /* + * at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (set_msr_hyperv(vcpu, msr, data)); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return (xen_hvm_config(vcpu, data)); + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %lx\n", + msr, data); + return (1); + } else { + cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %lx\n", + msr, data); + break; + } + } + + return (0); } -void -kvm_arch_destroy_vm(struct kvm *kvmp) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int +kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) { - if (kvmp == NULL) - return; /* nothing to do here */ + return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata)); +} - if (kvmp->arch.aliases) { - kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases)); - kvmp->arch.aliases = NULL; +static int +get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return (1); + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pdata = *pt; } - kmem_free(kvmp, sizeof (struct kvm)); + + return (0); } -#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ -#define MSR_IA32_FEATURE_CONTROL 0x0000003a +static int +get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return (1); + } + *pdata = data; + return (0); +} -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +static int +get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + struct kvm *kvm = vcpu->kvm; -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } -void 
-kvm_shared_msr_cpu_online(void) + *pdata = data; + + return (0); +} + +static int +get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) { - unsigned i; + uint64_t data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata)); + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } - for (i = 0; i < shared_msrs_global.nr; i++) - shared_msr_update(i, shared_msrs_global.msrs[i]); + *pdata = data; + return (0); } int -kvm_arch_hardware_enable(void *garbage) +kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return (get_msr_mtrr(vcpu, msr, pdata)); + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_read(vcpu, msr, pdata)); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (get_msr_mce(vcpu, msr, pdata)); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (get_msr_hyperv(vcpu, msr, pdata)); + break; + default: + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); + return (1); + } else { + cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + + return (0); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int +__msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, uint64_t *data)) { + int i, idx; + + vcpu_load(vcpu); + #ifdef XXX - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + for (i = 0; i < msrs->nmsrs; i++) { + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; } + +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - kvm_shared_msr_cpu_online(); + vcpu_put(vcpu); - return (kvm_x86_ops->hardware_enable(garbage)); + return (i); } -void -kvm_arch_hardware_disable(void *garbage) +int +kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { - kvm_x86_ops->hardware_disable(garbage); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - drop_user_return_notifiers(garbage); -#endif + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0) + return (r); + + *rv = r; + + return (0); } -static inline int -iommu_found(void) +int +kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0) + return (-EINVAL); + + *rv = r; + return (0); } @@ -374,7 +1615,7 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) r = EINVAL; break; case KVM_CAP_IOMMU: - *rval_p = iommu_found(); + *rval_p = 0; r = DDI_SUCCESS; break; case KVM_CAP_MCE: @@ -389,598 +1630,3166 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) return (r); } -static inline int -apic_x2apic_mode(struct kvm_lapic *apic) +/* XXX Some part of kvm_ioctl goes here? */ + +void +kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - return (apic->vcpu->arch.apic_base & X2APIC_ENABLE); + kvm_x86_ops->vcpu_load(vcpu, cpu); +#ifdef XXX + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { + unsigned long khz = cpufreq_quick_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } +#else + XXX_KVM_PROBE; +#endif + kvm_request_guest_time_update(vcpu); } void -kvm_inject_nmi(struct kvm_vcpu *vcpu) +kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = 1; + kvm_put_guest_fpu(vcpu); + + kvm_x86_ops->vcpu_put(vcpu); +} + +static int +is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return (efer & EFER_NX); } int -kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +{ + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + return (E2BIG); + + bcopy(cpuid->entries, vcpu->arch.cpuid_entries, + cpuid->nent * sizeof (struct kvm_cpuid_entry2)); + + vcpu_load(vcpu); + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - page_t *page; - struct kvm *kvm; int r; + struct kvm_cpuid_entry2 *entries = cpuid->entries; - kvm = vcpu->kvm; + cpuid->nent = vcpu->arch.cpuid_nent; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + if (cpuid->nent < vcpu->arch.cpuid_nent) + return (E2BIG); - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + bcopy(&vcpu->arch.cpuid_entries, cpuid->entries, + vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2)); + return (0); +} + +static inline void native_cpuid(unsigned int 
*eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + __asm__ volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +#define __cpuid native_cpuid + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void +cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +static void +do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + + +#define F(x) bit(X86_FEATURE_##x) + +static void +do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, + uint32_t index, int *nent, int maxnent) +{ + unsigned int ddic; + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const uint32_t kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const uint32_t kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const uint32_t kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ + const uint32_t kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; + + /* all calls to cpuid_count() should be made on the same cpu */ + /* XXX - right now, system panics at ddi_exit_critical() */ + /* XXX - to run everything on same cpu, bind qemu at startup */ + + kpreempt_disable(); + + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (uint32_t)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; + /* + * we support x2apic emulation even if host does not support + * it since we emulate x2apic in software + */ + entry->ecx |= F(X2APIC); + break; /* - * page = alloc_page(PAGESIZE, KM_SLEEP); - * if (!page) { - * r = ENOMEM; - 
* goto fail; - * } - * vcpu->arch.pio_data = page_address(page); + * function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - vcpu->arch.pio_data = (caddr_t)vcpu->run + - (KVM_PIO_PAGE_OFFSET * PAGESIZE); + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + /* + * XXX - see comment above for ddi_enter_critical() + * + * ddi_exit_critical(ddic); + */ + kpreempt_enable(); +} - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail; +#undef F - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } +int +kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = E2BIG; + uint32_t func; + int allocsize = 0; - vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * - sizeof (uint64_t) * 4, KM_SLEEP); + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = ENOMEM; + allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent; + cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP); + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + r = EFAULT; + if (copyout(cpuid_entries, entries, + nent * sizeof (kvm_cpuid_entry2_t))) + goto out_free; + + cpuid->nent = nent; + r = 0; + +out_free: + kmem_free(cpuid_entries, allocsize); +out: + return (r); +} - if (!vcpu->arch.mce_banks) { - r = ENOMEM; - goto fail_free_lapic; - } +int +kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + 
vcpu_load(vcpu); + bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s)); + vcpu_put(vcpu); - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return (0); +} + +int +kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s)); + kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); + vcpu_put(vcpu); return (0); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail: - return (r); } -void -kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +int +kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 * - KVM_MAX_MCE_BANKS); - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); + if (irq->irq < 0 || irq->irq >= 256) + return (-EINVAL); + + if (irqchip_in_kernel(vcpu->kvm)) + return (-ENXIO); + + vcpu_load(vcpu); + + kvm_queue_interrupt(vcpu, irq->irq, 0); + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp) +{ + int rval; + uint64_t mcg_cap = *mcg_capp; + unsigned bank_num = mcg_cap & 0xff, bank; + + rval = -EINVAL; + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + rval = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(uint64_t)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(uint64_t)0; +out: + return (rval); } +int +kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = vcpu->arch.interrupt.soft; + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR); + + vcpu_put(vcpu); + + return (0); +} int -kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { - int r; + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + return (-EINVAL); + + vcpu_load(vcpu); + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; - mutex_init(&vcpu->mutex, NULL, MUTEX_DRIVER, 0); - vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + + vcpu->arch.nmi_injected = events->nmi.injected; + + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + 
vcpu->arch.nmi_pending = events->nmi.pending; + + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr) +{ + /* + * XXX They have some other code here to check the validity of the + * address + */ + return (kvm_x86_ops->set_tss_addr(kvmp, addr)); +} + +int +kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return (0); +} + +gfn_t +unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; #ifdef XXX - init_waitqueue_head(&vcpu->wq); + aliases = rcu_dereference(kvm->arch.aliases); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; + aliases = kvm->arch.aliases; #endif - vcpu->run = ddi_umem_alloc(PAGESIZE * 2, DDI_UMEM_SLEEP, &vcpu->cookie); - r = kvm_arch_vcpu_init(vcpu); + for (i = 0; i < aliases->naliases; i++) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); + } - if (r != 0) { - vcpu->run = NULL; - ddi_umem_free(vcpu->cookie); - return (r); + return (gfn); +} + +gfn_t +unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + /* XXX need protection */ + aliases = kvm->arch.aliases; + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); } + return (gfn); +} + +int +kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + return (r); +} + +int +kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + kvm_pic_update_irq(pic_irqchip(kvm)); + + return (r); +} + +int +kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + struct kvm_pit *vpit = kvm->arch.vpit; + + mutex_enter(&vpit->pit_state.lock); + memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels)); + ps->flags = vpit->pit_state.flags; + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +int +kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + boolean_t prev_legacy, cur_legacy, start = B_FALSE; + struct kvm_pit *vpit = kvm->arch.vpit; + + 
mutex_enter(&vpit->pit_state.lock); + prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + + if (!prev_legacy && cur_legacy) + start = B_TRUE; + + memcpy(&vpit->pit_state.channels, &ps->channels, + sizeof (vpit->pit_state.channels)); + + vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start); + + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +/* TODO: As Pascal would say, we can do better */ +int +kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg) +{ + + struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; + struct kvm_msr_list *msr_list; + size_t sz = sizeof (struct kvm_msr_list); + unsigned n; + + msr_list = kmem_zalloc(sz, KM_SLEEP); + + if (copyin(user_msr_list, msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + n = msr_list->nmsrs; + msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + + if (copyout(msr_list, user_msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (n < msr_list->nmsrs) { + kmem_free(msr_list, sz); + return (E2BIG); + } + + if (copyout(&msrs_to_save, user_msr_list->indices, + num_msrs_to_save * sizeof (uint32_t))) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (copyout(&emulated_msrs, user_msr_list->indices + + num_msrs_to_save, ARRAY_SIZE(emulated_msrs) * + sizeof (uint32_t)) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + kmem_free(msr_list, sz); return (0); } /* - * For pages for which vmx needs physical addresses, - * linux allocates pages from an area that maps virtual - * addresses 1-1 with physical memory. In this way, - * translating virtual to physical just involves subtracting - * the start of the area from the virtual address. - * This solaris version uses kmem_alloc, so there is no - * direct mapping of virtual to physical. We'll change this - * later if performance is an issue. For now, we'll use - * hat_getpfnum() to do the conversion. Also note that - * we're assuming 64-bit address space (we won't run on - * 32-bit hardware). + * Get (and clear) the dirty memory log for a memory slot. */ -uint64_t -kvm_va2pa(caddr_t va) +int +kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - uint64_t pa; + int r, i; + struct kvm_memory_slot *memslot; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET); - return (pa); -} + mutex_enter(&kvm->slots_lock); -#ifdef XXX_KVM_DECLARATION -unsigned long *vmx_io_bitmap_a; -unsigned long *vmx_io_bitmap_b; -unsigned long *vmx_msr_bitmap_legacy; -unsigned long *vmx_msr_bitmap_longmode; + r = EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap = kmem_alloc(n, KM_SLEEP); + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n / sizeof (long); i++) + is_dirty = memslot->dirty_bitmap[i]; + + /* If nothing is dirty, don't bother messing with page tables. 
*/ + if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + + mutex_enter(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + mutex_exit(&kvm->mmu_lock); + + slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; +#ifdef XXX + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); #else -/* make these arrays to try to force into low 4GB memory... */ -/* also need to be aligned... */ -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)]; + kvm->memslots = slots; + XXX_KVM_SYNC_PROBE; #endif + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kmem_free(old_slots, sizeof (struct kvm_memslots)); + } -struct kvm_vcpu * -kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) -{ - char buf[32]; - struct kvm_vcpu *vcpu; - kstat_t *kstat; + r = 0; + if (copyout(dirty_bitmap, log->v.dirty_bitmap, n) != 0) + r = EFAULT; +out_free: + kmem_free(dirty_bitmap, n); +out: + mutex_exit(&kvm->slots_lock); + return (r); +} - (void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid); +/* XXX kvm_arch_vm_ioctl */ - if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED, - sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) == NULL) { - return (NULL); +static void +kvm_init_msr_list(void) +{ + uint32_t dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; } + num_msrs_to_save = j; +} - vcpu = kvm_x86_ops->vcpu_create(kvm, id); +static int +vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - if (vcpu == NULL) { - kstat_delete(kstat); - return (NULL); - } + return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - vcpu->kvcpu_kstat = kstat; - vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats; +static int +vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid; + return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "pid"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvm_pid; +gpa_t +kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
+ PFERR_USER_MASK : 0; - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail, - "inst-emulation-fail"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls"); + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); +} - kstat_install(vcpu->kvcpu_kstat); +gpa_t +kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - return (vcpu); + access |= PFERR_WRITE_MASK; + + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); } -void -kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +static int +kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t access, uint32_t *error) { - if (vcpu->arch.time_page) { - /* XXX We aren't doing anything with the time page */ - XXX_KVM_PROBE; - vcpu->arch.time_page = NULL; + uintptr_t data = (uintptr_t)val; + int r = 0; /* X86EMUL_CONTINUE */ + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, + access, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = 1; /* X86EMUL_PROPAGATE_FAULT */ + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, (void *)data, toread); + if (ret < 0) { + r = 1; /* X86EMUL_UNHANDLEABLE */ + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; } +out: + return (r); +} - if (vcpu->kvcpu_kstat != NULL) - kstat_delete(vcpu->kvcpu_kstat); +/* used for instruction fetching */ +static int +kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - kvm_x86_ops->vcpu_free(vcpu); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error)); } +static int +kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
+ PFERR_USER_MASK : 0; -uint64_t -kvm_get_apic_base(struct kvm_vcpu *vcpu) + return (kvm_read_guest_virt_helper(addr, val, + bytes, vcpu, access, error)); +} + +static int +kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - if (irqchip_in_kernel(vcpu->kvm)) - return (vcpu->arch.apic_base); - else - return (vcpu->arch.apic_base); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error)); } -void -kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) +static int +kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; + uintptr_t data = (uintptr_t)val; + + while (bytes) { + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) + return (X86EMUL_PROPAGATE_FAULT); + + if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0) + return (X86EMUL_UNHANDLEABLE); + + bytes -= towrite; + data += towrite; + addr += towrite; + } + + return (0); } -void -kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +static int +emulator_read_emulated(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; + gpa_t gpa; + uint32_t error_code; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, + vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val); + + vcpu->mmio_read_completed = 0; + return (X86EMUL_CONTINUE); } - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, + bytes, vcpu, NULL) == X86EMUL_CONTINUE) + return (X86EMUL_CONTINUE); + +mmio: + /* + * Is this MMIO handled locally? 
+ */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + return (X86EMUL_CONTINUE); + } + + KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes, + uintptr_t, gpa); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return (X86EMUL_UNHANDLEABLE); } int -is_paging(struct kvm_vcpu *vcpu) +emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) { - return (kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + + if (ret < 0) + return (0); + + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + + return (1); } -unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; +static int +emulator_write_emulated_onepage(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return (X86EMUL_CONTINUE); + +mmio: + KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return (X86EMUL_CONTINUE); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return (X86EMUL_CONTINUE); +} int -kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +emulator_write_emulated(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - return (kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len)); + uintptr_t data = (uintptr_t)val; + + /* Crossing a page boundary? 
*/ + if (((addr + bytes - 1) ^ addr) & PAGEMASK) { + int rc, now; + + now = -addr & ~PAGEMASK; + rc = emulator_write_emulated_onepage(addr, + (void *)data, now, vcpu); + + if (rc != X86EMUL_CONTINUE) + return (rc); + + addr += now; + data += now; + bytes -= now; + } + + return (emulator_write_emulated_onepage(addr, val, bytes, vcpu)); +} + +static int +emulator_cmpxchg_emulated(unsigned long addr, const void *old, + const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) +{ + cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + page_t page; + char *kaddr; + uint64_t val; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) + goto emul_write; + + val = *(uint64_t *)new; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); + kaddr = kmap_atomic(page, KM_USER0); + + set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + } +emul_write: +#endif + + return (emulator_write_emulated(addr, new, bytes, vcpu)); +} + +static unsigned long +get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return (kvm_x86_ops->get_segment_base(vcpu, seg)); } void -fx_init(struct kvm_vcpu *vcpu) +kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - unsigned after_mxcsr_mask; + uint8_t opcodes[4]; + unsigned long rip = kvm_rip_read(vcpu); + unsigned long rip_linear; + #ifdef XXX - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. 
- */ - if (!used_math()) + if (!printk_ratelimit()) + return; #else XXX_KVM_PROBE; #endif - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ - kpreempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - kpreempt_enable(); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image + - after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) - - after_mxcsr_mask); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); + + cmn_err(CE_WARN, "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +static void +cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; } int -kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + uint16_t error_code, int emulation_type) { - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = 0; + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; - vcpu->arch.switch_db_regs = 0; - memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; - vcpu->arch.dr7 = DR7_FIXED_1; + kvm_clear_exception_queue(vcpu); + vcpu->arch.mmio_fault_cr2 = cr2; - return (kvm_x86_ops->vcpu_reset(vcpu)); + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? + X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ? + X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* + * Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall) + */ + c = &vcpu->arch.emulate_ctxt.decode; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return (EMULATE_FAIL); + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return (EMULATE_FAIL); + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + default: + return (EMULATE_FAIL); + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return (EMULATE_FAIL); + } + + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation); + + if (r) { + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail); + + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + return (EMULATE_FAIL); + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return (EMULATE_DONE); + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return (EMULATE_DO_MMIO); + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return (EMULATE_FAIL); + } + + return (EMULATE_DO_MMIO); + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return (EMULATE_DO_MMIO); + } + + return (EMULATE_DONE); } -struct kvm_memory_slot * -gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static int +pio_copy_data(struct kvm_vcpu *vcpu) { - gfn = unalias_gfn(kvm, gfn); - return (gfn_to_memslot_unaliased(kvm, gfn)); + void *p = vcpu->arch.pio_data; + gva_t q = vcpu->arch.pio.guest_gva; + unsigned bytes; + int ret; + uint32_t error_code; + + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + + if (vcpu->arch.pio.in) + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); + else + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + + return (ret); } -unsigned long -kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +int +complete_pio(struct kvm_vcpu *vcpu) { - struct vm_area_struct *vma; - unsigned long addr, size; + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + unsigned long val; - size = PAGESIZE; + if (!io->string) { + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) + goto out; + } - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (PAGESIZE); + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. 
+ */ + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } + } +out: + io->count -= io->cur_count; + io->cur_count = 0; -#ifdef XXX - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) + return (0); +} + +static int +kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) { + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + } else { + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, pd); + } + + return (r); +} + +int +kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) +{ + unsigned long val; + + DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size, + unsigned long, 1) + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE; + vcpu->run->io.count = vcpu->arch.pio.count = + vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.rep = 0; + + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + complete_pio(vcpu); + return (1); + } + + return (0); +} + +void +kvm_timer_fire(void *arg) +{ + struct kvm_timer *timer = (struct kvm_timer *)arg; + struct kvm_vcpu *vcpu = timer->vcpu; + + if (vcpu == NULL) + return; + + mutex_enter(&vcpu->kvcpu_kick_lock); + + if (timer->reinject || !timer->pending) { + atomic_add_32(&timer->pending, 1); + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + } + + timer->intervals++; + + cv_broadcast(&vcpu->kvcpu_kick_cv); + mutex_exit(&vcpu->kvcpu_kick_lock); +} + +static void +kvm_timer_init(void) +{ + int cpu; + + /* + * XXX We assume that any machine running solaris kvm + * has constant time stamp counter increment rate. + * This will be true for all but older machines. 
+ */ + /* assume pi_clock in mhz */ + cpu_tsc_khz = (cpu_freq_hz / 1000); +} + +int +kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (ops->cpu_has_kvm_support()) { + cmn_err(CE_WARN, "kvm: no hardware support\n"); + r = ENOTSUP; goto out; + } + if (ops->disabled_by_bios()) { + cmn_err(CE_WARN, "kvm: disabled by bios\n"); + r = ENOTSUP; + goto out; + } - size = vma_kernel_pagesize(vma); + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); + + kvm_timer_init(); + + return (0); out: - up_read(¤t->mm->mmap_sem); - return (size); + return (r); +} + +int +kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits); + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + return (1); + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return (0); + } +} + +int +kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + uint64_t param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + int fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return (0); + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RAX) & 0xffffffff); + + ingpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RCX) & 0xffffffff); + + outgpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast, + uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa, + uintptr_t, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +#ifdef XXX + kvm_vcpu_on_spin(vcpu); #else - XXX_KVM_PROBE; - return (PAGESIZE); + XXX_KVM_PROBE; #endif + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((uint64_t)rep_done & 0xfff) << 32); + + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return (1); } -static pfn_t -hva_to_pfn(struct kvm *kvm, unsigned long addr) +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 + +int +kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - page_t page[1]; - int npages; - pfn_t pfn; - proc_t *procp = ttoproc(curthread); - struct as *as = procp->p_as; + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return (kvm_hv_hypercall(vcpu)); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = 
kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + + KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1, + uintptr_t, a2, uintptr_t, a3); + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -EPERM; + goto out; + } + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: #ifdef XXX + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); +#else + XXX_KVM_PROBE; + ret = -ENOSYS; +#endif + break; + default: + ret = -ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - npages = get_user_pages_fast(addr, 1, 1, page); + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls); - if (unlikely(npages != 1)) { - struct vm_area_struct *vma; + return (r); +} - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); +static int +move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return (j); + } + } - if (vma == NULL || addr < vma->vm_start || - !(vma->vm_flags & VM_PFNMAP)) { - up_read(¤t->mm->mmap_sem); - get_page(bad_page); - return (page_to_pfn(bad_page)); + return (0); /* silence gcc, even though control never reaches here */ +} + +/* + * find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) + */ +static int +is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + uint32_t function, uint32_t index) +{ + if (e->function != function) + return (0); + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return (0); + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return (0); + return (1); +} + +struct kvm_cpuid_entry2 * +kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } - pfn = ((addr - vma->vm_start) >> PAGESHIFT) + vma->vm_pgoff; - up_read(¤t->mm->mmap_sem); - BUG_ON(!kvm_is_mmio_pfn(pfn)); - } else - pfn = page_to_pfn(page[0]); -#else - XXX_KVM_PROBE; - if (addr < kernelbase) - pfn = hat_getpfnum(as->a_hat, (caddr_t)addr); + return (best); +} + +int +cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + return (36); /* from linux. number of bits, perhaps? 
*/ +} + +void +kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + uint32_t function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + + KVM_TRACE5(cpuid, uint32_t, function, + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RAX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RBX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RCX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RDX)); +} + +static int +dm_request_for_irq_injection(struct kvm_vcpu *vcpu) +{ + return (!irqchip_in_kernel(vcpu->kvm) && + !kvm_cpu_has_interrupt(vcpu) && + vcpu->run->request_interrupt_window && + kvm_arch_interrupt_allowed(vcpu)); +} + +static void +post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; else - pfn = hat_getpfnum(kas.a_hat, (caddr_t)addr); + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +} + +static void +vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + page_t *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + + vcpu->arch.apic->vapic_page = page; +} + +static void +vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; + + if (!apic || !apic->vapic_addr) + return; +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#else + XXX_KVM_SYNC_PROBE; #endif - return (pfn); } -pfn_t -gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static void +update_cr8_intercept(struct kvm_vcpu *vcpu) { - unsigned long addr; - pfn_t pfn; + int max_irr, tpr; - addr = gfn_to_hva(kvm, gfn); + if (!kvm_x86_ops->update_cr8_intercept) + return; - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return (page_to_pfn(bad_page)); - } + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; - pfn = hva_to_pfn(kvm, addr); + if (max_irr != -1) + max_irr >>= 4; + tpr = kvm_lapic_get_cr8(vcpu); - return (pfn); + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } +static void +inject_pending_event(struct kvm_vcpu *vcpu) +{ + /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } -int -is_error_pfn(pfn_t pfn) + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } 
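	/*
	 * Reinjection order so far: a pending exception is queued first,
	 * then an already-injected NMI.  A previously injected external
	 * interrupt is replayed next; only when nothing is awaiting
	 * reinjection do we move on to injecting a new NMI or interrupt
	 * below.
	 */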
+ + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = 0; + vcpu->arch.nmi_injected = 1; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + 0); + kvm_x86_ops->set_irq(vcpu); + } + } +} + +static inline unsigned long +native_get_debugreg(int regno) { - return (pfn == bad_pfn); + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + __asm__("mov %%db0, %0" :"=r" (val)); + break; + case 1: + __asm__("mov %%db1, %0" :"=r" (val)); + break; + case 2: + __asm__("mov %%db2, %0" :"=r" (val)); + break; + case 3: + __asm__("mov %%db3, %0" :"=r" (val)); + break; + case 6: + __asm__("mov %%db6, %0" :"=r" (val)); + break; + case 7: + __asm__("mov %%db7, %0" :"=r" (val)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register retrieval, " + "regno = %d\n", regno); + } + + return (val); } -page_t * -pfn_to_page(pfn_t pfn) +static inline void +native_set_debugreg(int regno, unsigned long value) { - return (page_numtopp_nolock(pfn)); + switch (regno) { + case 0: + __asm__("mov %0, %%db0" ::"r" (value)); + break; + case 1: + __asm__("mov %0, %%db1" ::"r" (value)); + break; + case 2: + __asm__("mov %0, %%db2" ::"r" (value)); + break; + case 3: + __asm__("mov %0, %%db3" ::"r" (value)); + break; + case 6: + __asm__("mov %0, %%db6" ::"r" (value)); + break; + case 7: + __asm__("mov %0, %%db7" ::"r" (value)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register set, " + "regno = %d\n", regno); + } } -void -kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn) +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) + +static int +vcpu_enter_guest(struct kvm_vcpu *vcpu) { + int r; + + int req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + } + + r = kvm_mmu_reload(vcpu); + + if (r) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, + &vcpu->requests)) { + __kvm_migrate_timers(vcpu); + } + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, + &vcpu->requests)) { + kvm_write_guest_time(vcpu); + } + + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); + + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, + &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + } + + kpreempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); + + cli(); + + clear_bit(KVM_REQ_KICK, &vcpu->requests); #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); + smp_mb__after_clear_bit(); #else XXX_KVM_PROBE; #endif -} + 
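	/*
	 * Interrupts are now disabled and KVM_REQ_KICK has been cleared,
	 * so this is the last point at which a new request or a pending
	 * signal can be observed before entering the guest.  If either
	 * shows up here, re-set KVM_REQ_KICK and back out rather than
	 * enter the guest with work outstanding.
	 */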
if (vcpu->requests || issig(JUSTLOOKING)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); + kpreempt_enable(); + r = 1; + goto out; + } -void -kvm_set_pfn_dirty(pfn_t pfn) -{ + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) - SetPageDirty(page); /* XXX - not defined in linux?! */ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_PROBE; +#endif + kvm_guest_enter(); + + if (vcpu->arch.switch_db_regs) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); } + + KVM_TRACE1(vm__entry, int, vcpu->vcpu_id); + + kvm_x86_ops->run(vcpu); +#ifdef XXX + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); #else XXX_KVM_PROBE; #endif -} + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); -int -memslot_id(struct kvm *kvm, gfn_t gfn) -{ - int i; -#ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#ifdef XXX + local_irq_enable(); /* XXX - should be ok with kpreempt_enable below */ + + barrier(); #else - struct kvm_memslots *slots = kvm->memslots; + XXX_KVM_PROBE; #endif - struct kvm_memory_slot *memslot = NULL; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_exits); + kvm_guest_exit(); - gfn = unalias_gfn(kvm, gfn); - for (i = 0; i < slots->nmemslots; ++i) { - memslot = &slots->memslots[i]; + kpreempt_enable(); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - break; + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } +#else + XXX_KVM_PROBE; +#endif + kvm_lapic_sync_from_vapic(vcpu); + r = kvm_x86_ops->handle_exit(vcpu); - return (memslot - slots->memslots); +out: + return (r); } -void -kvm_release_pfn_dirty(pfn_t pfn) +static int +__vcpu_run(struct kvm_vcpu *vcpu) { - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); -} + int r; + struct kvm *kvm = vcpu->kvm; -int -cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - return (36); /* from linux. number of bits, perhaps? 
*/ -} + if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return (r); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu); + else { +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_vcpu_block(vcpu); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (test_and_clear_bit(KVM_REQ_UNHALT, + &vcpu->requests)) { + switch (vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_irq_exits); + } + + if (issig(JUSTLOOKING)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_signal_exits); + } + } +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + post_kvm_run_save(vcpu); + vapic_exit(vcpu); + + return (r); +} int -kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r; - unsigned long addr; - gfn_t gfn = gpa >> PAGESHIFT; - int offset = offset_in_page(gpa); + sigset_t sigsaved; + struct kvm_run *kvm_run = vcpu->run; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (-EFAULT); + vcpu_load(vcpu); + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { #ifdef XXX - pagefault_disable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - - r = copyin((caddr_t)addr + offset, data, len); + r = complete_pio(vcpu); #ifdef XXX - pagefault_enable(); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - if (r) - return (-EFAULT); + if (r) + goto out; + } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
+ */ + r = 0; + goto out; + } + } + + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return (r); +} + +int +kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + vcpu_put(vcpu); return (0); } +int +kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = 0; + + vcpu_put(vcpu); + + return (0); +} + +void +kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +int +kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = 
kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof (sregs->interrupt_bitmap)); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) { + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); + } + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); + return (0); +} + +int +kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + vcpu->arch.mp_state = mp_state->mp_state; + vcpu_put(vcpu); + return (0); +} + +static void +kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + static void -ack_flush(void *_completed) +seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); + if (seg_desc->c.b.g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->c.b.type; + kvm_desct->present = seg_desc->c.b.p; + kvm_desct->dpl = seg_desc->c.b.dpl; + kvm_desct->db = seg_desc->c.b.d; + kvm_desct->s = seg_desc->c.b.s; + kvm_desct->l = seg_desc->c.b.l; + kvm_desct->g = seg_desc->c.b.g; + kvm_desct->avl = seg_desc->c.b.avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void +get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, uint16_t selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int +load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + int ret; + uint32_t err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return (1); + } + + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof (*seg_desc), + vcpu, &err); + + if (ret == 1) + kvm_inject_page_fault(vcpu, addr, err); + + return (ret); +} + +/* allowed just for 8 bytes segments */ +static int +save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return (1); + + return kvm_write_guest_virt(dtable.base + index * 8, seg_desc, + sizeof (*seg_desc), vcpu, NULL); +} + +static gpa_t +get_tss_base_addr_write(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL)); +} + +static gpa_t +get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = 
get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL)); +} + +static uint16_t +get_segment_selector(struct kvm_vcpu *vcpu, int seg) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + + return (kvm_seg.selector); +} + +static int +kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return (0); +} + +static int +is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } int -make_all_cpus_request(struct kvm *kvm, unsigned int req) +kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg) { - int i; - cpuset_t set; - processorid_t me, cpu; - struct kvm_vcpu *vcpu; + struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + uint8_t dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + uint32_t err_code = 0; + int null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - CPUSET_ZERO(set); + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return (kvm_load_realmode_segment(vcpu, selector, seg)); - mutex_enter(&kvm->requests_lock); - me = curthread->t_cpu->cpu_id; - for (i = 0; i < kvm->online_vcpus; i++) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - break; - if (test_and_set_bit(req, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != me) - CPUSET_ADD(set, cpu); + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || + seg == VCPU_SREG_TR) && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + + if (ret) + return (ret); + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; } - if (CPUSET_ISNULL(set)) - kvm_xcall(KVM_CPUALL, ack_flush, NULL); - else { - kpreempt_disable(); - xc_sync((xc_arg_t) ack_flush, (xc_arg_t) NULL, - 0, CPUSET2BV(set), (xc_func_t) kvm_xcall_func); - kpreempt_enable(); + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; } - mutex_exit(&kvm->requests_lock); + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.c.b.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return (0); +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); return (1); + } -void -kvm_flush_remote_tlbs(struct kvm *kvm) +static void +save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) - KVM_KSTAT_INC(kvm, kvmks_remote_tlb_flush); + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } -gfn_t -unalias_gfn(struct kvm *kvm, gfn_t gfn) +static void +kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg) { - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} - /* XXX need protection */ - aliases = kvm->arch.aliases; +static int +load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn && - gfn < alias->base_gfn + alias->npages) - return (alias->target_gfn + gfn - alias->base_gfn); + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, 
tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, + tss->ldt_selector, VCPU_SREG_LDTR)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) + return (1); + + return (0); +} + +static void +save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) +{ + tss->ip = kvm_rip_read(vcpu); + tss->flag = kvm_get_rflags(vcpu); + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static int +load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) +{ + kvm_rip_write(vcpu, tss->ip); + kvm_set_rflags(vcpu, tss->flag | 2); + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. 
If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return (1); + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return (1); + + return (0); +} + +static int +kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) +{ + struct tss_segment_16 tss_segment_16; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, + &tss_segment_16, sizeof (tss_segment_16))) + goto out; + + save_state_to_tss16(vcpu, &tss_segment_16); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, + &tss_segment_16, sizeof (tss_segment_16))) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_16, sizeof (tss_segment_16))) + goto out; + + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, + nseg_desc), &tss_segment_16.prev_task_link, + sizeof (tss_segment_16.prev_task_link))) + goto out; } - return (gfn); + + if (load_state_from_tss16(vcpu, &tss_segment_16)) + goto out; + + ret = 1; +out: + return (ret); +} + +static int +kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) +{ + struct tss_segment_32 tss_segment_32; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, + &tss_segment_32, sizeof (tss_segment_32))) + goto out; + + save_state_to_tss32(vcpu, &tss_segment_32); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, + &tss_segment_32, sizeof (tss_segment_32))) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_32, sizeof (tss_segment_32))) + goto out; + + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, + nseg_desc), &tss_segment_32.prev_task_link, + sizeof (tss_segment_32.prev_task_link))) + goto out; + } + + if (load_state_from_tss32(vcpu, &tss_segment_32)) + goto out; + + ret = 1; +out: + return (ret); } int -is_pse(struct kvm_vcpu *vcpu) +kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason) { - return (kvm_read_cr4_bits(vcpu, X86_CR4_PSE)); + struct kvm_segment tr_seg; + struct desc_struct cseg_desc; + struct desc_struct nseg_desc; + int ret = 0; + uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + uint32_t desc_limit; + + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); + + /* + * FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. 
+ */ + if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) + goto out; + + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) + goto out; + + if (reason != TASK_SWITCH_IRET) { + int cpl; + + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > nseg_desc.c.b.dpl || + cpl > nseg_desc.c.b.dpl) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return (1); + } + } + + desc_limit = get_desc_limit(&nseg_desc); + + if (!nseg_desc.c.b.p || ((desc_limit < 0x67 && + (nseg_desc.c.b.type & 8)) || desc_limit < 0x2b)) { + kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); + return (1); + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + cseg_desc.c.b.type &= ~(1 << 1); // clear the B flag + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); + } + + if (reason == TASK_SWITCH_IRET) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + } + + /* + * set back link to prev task only if NT bit is set in eflags + * note that old_tss_sel is not used afetr this point + */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (nseg_desc.c.b.type & 8) { + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + } else { + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + } + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); + } + + if (reason != TASK_SWITCH_IRET) { + nseg_desc.c.b.type |= (1 << 1); + save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc); + } + + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); + seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); + tr_seg.type = 11; + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); +out: + return (ret); +} + +static unsigned long +find_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + (offset/64); + unsigned long result = offset & ~(64-1); + unsigned long tmp; + + if (offset >= size) + return (size); + + size -= result; + offset %= 64; + + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~(64-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + + if (!size) + return (result); + tmp = *p; + +found_first: + tmp &= (~0UL >> (64 - size)); + if (tmp == 0UL) /* Are any bits set? */ + return (result + size); /* Nope. 
*/ +found_middle: + return (result + __ffs(tmp)); +} + +int +kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + int mmu_reset_needed = 0; + int pending_vec, max_bits; + struct descriptor_table dt; + + vcpu_load(vcpu); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_x86_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_x86_ops->set_gdt(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + + kvm_set_cr8(vcpu, sregs->cr8); + + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_set_apic_base(vcpu, sregs->apic_base); + + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + vcpu->arch.cr0 = sregs->cr0; + + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + + if (!is_long_mode(vcpu) && is_pae(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.cr3); + mmu_reset_needed = 1; + } + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + max_bits = (sizeof (sregs->interrupt_bitmap)) << 3; + pending_vec = + find_next_bit((const unsigned long *)sregs->interrupt_bitmap, + max_bits, 0); + + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, 0); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + } + + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + update_cr8_intercept(vcpu); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + /* Older userspace won't unhalt the vcpu on reset. */ + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !is_protmode(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +#endif /* CONFIG_KVM_APIC_ARCHITECTURE */ + + vcpu_put(vcpu); + + return (0); +} + +/* + * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when + * we have asm/x86/processor.h + * + * 8*16 bytes for each FP-reg = 128 bytes + * 16*16 bytes for each XMM-reg=256 bytes + */ +typedef struct fxsave { + uint16_t cwd; + uint16_t swd; + uint16_t twd; + uint16_t fop; + uint64_t rip; + uint64_t rdp; + uint32_t mxcsr; + uint32_t mxcsr_mask; + uint32_t st_space[32]; +#ifdef CONFIG_X86_64 + uint32_t xmm_space[64]; +#else + uint32_t xmm_space[32]; +#endif +} fxsave_t; + +int +kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof (fxsave->xmm_space)); + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof (fxsave->xmm_space)); + + vcpu_put(vcpu); + + return (0); } void -kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn) +fx_init(struct kvm_vcpu *vcpu) { - if (!kvm_is_mmio_pfn(pfn)) - get_page(pfn_to_page(pfn)); + unsigned after_mxcsr_mask; +#ifdef XXX + /* + * Touch the fpu the first time in non atomic context as if + * this is the first fpu instruction the exception handler + * will fire before the instruction returns and it'll have to + * allocate ram with GFP_KERNEL. 
+ */ + if (!used_math()) +#else + XXX_KVM_PROBE; +#endif + kvm_fx_save(&vcpu->arch.host_fx_image); + + /* Initialize guest FPU by resetting ours and saving into guest's */ + kpreempt_disable(); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_finit(); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); + kpreempt_enable(); + + vcpu->arch.cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->arch.guest_fx_image.mxcsr = 0x1f80; + memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image + + after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) - + after_mxcsr_mask); +} + +void +kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 1; + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_restore(&vcpu->arch.guest_fx_image); + KVM_TRACE1(fpu, int, 1); +} + +void +kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 0; + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_fpu_reload); + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + KVM_TRACE1(fpu, int, 0); +} + +void +kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + /* XXX We aren't doing anything with the time page */ + XXX_KVM_PROBE; + vcpu->arch.time_page = NULL; + } + + if (vcpu->kvcpu_kstat != NULL) + kstat_delete(vcpu->kvcpu_kstat); + + kvm_x86_ops->vcpu_free(vcpu); +} + +struct kvm_vcpu * +kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +{ + char buf[32]; + struct kvm_vcpu *vcpu; + kstat_t *kstat; + + (void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid); + + if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED, + sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) == NULL) { + return (NULL); + } + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + if (vcpu == NULL) { + kstat_delete(kstat); + return (NULL); + } + + vcpu->kvcpu_kstat = kstat; + vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats; + + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id"); + vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid; + + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "pid"); + vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvm_pid; + + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail, + "inst-emulation-fail"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest"); + KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed"); + 
KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls"); + + kstat_install(vcpu->kvcpu_kstat); + + return (vcpu); } int @@ -1016,94 +4825,243 @@ free_vcpu: return (r); } -void -kvm_get_kvm(struct kvm *kvm) +int +kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - atomic_inc_32(&kvm->users_count); + vcpu->arch.nmi_pending = 0; + vcpu->arch.nmi_injected = 0; + + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + + return (kvm_x86_ops->vcpu_reset(vcpu)); } -/* - * Creates some virtual cpus. Good luck creating more than one. - */ int -kvm_vm_ioctl_create_vcpu(struct kvm *kvm, uint32_t id, int *rval_p) +kvm_arch_hardware_enable(void *garbage) { - int r, i; - struct kvm_vcpu *vcpu, *v; - - vcpu = kvm_arch_vcpu_create(kvm, id); - if (vcpu == NULL) - return (EINVAL); - #ifdef XXX - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + /* + * Since this may be called from a hotplug notifcation, + * we can't get the CPU frequency directly. + */ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + int cpu = raw_smp_processor_id(); + per_cpu(cpu_tsc_khz, cpu) = 0; + } #else XXX_KVM_PROBE; #endif + kvm_shared_msr_cpu_online(); - r = kvm_arch_vcpu_setup(vcpu); - if (r) - return (r); - - mutex_enter(&kvm->lock); + return (kvm_x86_ops->hardware_enable(garbage)); +} -#ifdef XXX - if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { -#else - XXX_KVM_SYNC_PROBE; - if (kvm->online_vcpus == KVM_MAX_VCPUS) { +void +kvm_arch_hardware_disable(void *garbage) +{ + kvm_x86_ops->hardware_disable(garbage); +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + drop_user_return_notifiers(garbage); #endif - r = EINVAL; - goto vcpu_destroy; +} + +int +kvm_arch_hardware_setup(void) +{ + return (kvm_x86_ops->hardware_setup()); +} + +void +kvm_arch_check_processor_compat(void *rtn) +{ + kvm_x86_ops->check_processor_compatibility(rtn); +} + +int +kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + page_t *page; + struct kvm *kvm; + int r; + + kvm = vcpu->kvm; + + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + + /* + * page = alloc_page(PAGESIZE, KM_SLEEP); + * if (!page) { + * r = ENOMEM; + * goto fail; + * } + * vcpu->arch.pio_data = page_address(page); + */ + vcpu->arch.pio_data = (caddr_t)vcpu->run + + (KVM_PIO_PAGE_OFFSET * PAGESIZE); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail; + + if (irqchip_in_kernel(kvm)) { + r = kvm_create_lapic(vcpu); + if (r < 0) + goto fail_mmu_destroy; } - /* kvm_for_each_vcpu(r, v, kvm) */ - for (i = 0; i < kvm->online_vcpus; i++) { - v = kvm->vcpus[i]; - if (v->vcpu_id == id) { - r = -EEXIST; - goto vcpu_destroy; - } + vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * + sizeof (uint64_t) * 4, KM_SLEEP); + + if (!vcpu->arch.mce_banks) { + r = ENOMEM; + goto fail_free_lapic; + } + + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + return (0); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); +fail: + return (r); +} + +void +kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 * + KVM_MAX_MCE_BANKS); + kvm_free_lapic(vcpu); + kvm_mmu_destroy(vcpu); +} + +struct kvm * +kvm_arch_create_vm(void) +{ + struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP); + + if (!kvm) + return (NULL); + + if ((kvm->arch.aliases = + 
	    kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) {
+		kmem_free(kvm, sizeof (struct kvm));
+		return (NULL);
 	}
-	/* BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); */
+	list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page),
+	    offsetof(struct kvm_mmu_page, link));

-	/* Now it's all set up, let userspace reach it */
-	kvm_get_kvm(kvm);
+	list_create(&kvm->arch.assigned_dev_head,
+	    sizeof (struct kvm_assigned_dev_kernel),
+	    offsetof(struct kvm_assigned_dev_kernel, list));

-	*rval_p = kvm->online_vcpus;  /* guarantee unique id */
-	vcpu->vcpu_id = *rval_p;
+	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

-	/* XXX need to protect online_vcpus */
-	kvm->vcpus[kvm->online_vcpus] = vcpu;
+	/* XXX - original is rdtscll() */
+	kvm->arch.vm_init_tsc = (uint64_t)gethrtime();

-#ifdef XXX
-	smp_wmb();
-#else
+	return (kvm);
+}
+
+static void
+kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_mmu_unload(vcpu);
+	vcpu_put(vcpu);
+}
+
+static void
+kvm_free_vcpus(struct kvm *kvmp)
+{
+	int ii, maxcpus;
+
+	maxcpus = kvmp->online_vcpus;
 	XXX_KVM_SYNC_PROBE;
-#endif
-	atomic_inc_32(&kvm->online_vcpus);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_unload_vcpu_mmu(kvmp->vcpus[ii]);

-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	if (kvm->bsp_vcpu_id == id)
-		kvm->bsp_vcpu = vcpu;
-#endif
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_arch_vcpu_free(kvmp->vcpus[ii]);

-	mutex_exit(&kvm->lock);
-	return (r);
+	mutex_enter(&kvmp->lock);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvmp->vcpus[ii] = NULL;
+	kvmp->online_vcpus = 0;
+	mutex_exit(&kvmp->lock);
+}
+
+/*
+ * This function exists because of a difference in methodology from our
+ * ancestor. In the ancestral code there is no impetus to clean up lists and
+ * mutexes. This is unfortunate, because they even appear to have debug
+ * kernels that would check for these kinds of things. But because in the
+ * common case mutex_exit is currently a #define to do {} while (0), it seems
+ * that they simply ignore this.
+ *
+ * This leads to the following behavior: during our lifetime we create a lot
+ * of auxiliary structures potentially related to PITs, APICs, etc. Tearing
+ * down these structures relies on holding the correct locks, etc. However,
+ * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. it does
+ * the kmem_free. Logically, these auxiliary structures need to be freed and
+ * dealt with before we go back and do the rest of the tear down related to the
+ * device.
+ */ +void +kvm_arch_destroy_vm_comps(struct kvm *kvmp) +{ + if (kvmp == NULL) -vcpu_destroy: +#ifdef IOMMU + kvm_iommu_unmap_guest(kvmp); +#else + XXX_KVM_PROBE; +#endif /* IOMMU */ + kvm_free_pit(kvmp); + kvm_free_vcpus(kvmp); + kvm_free_physmem(kvmp); #ifdef XXX - mutex_exit(&kvm->lock); - kvm_arch_vcpu_destroy(vcpu); +#ifdef APIC + if (kvm->arch.apic_access_page) + put_page(kvm->arch.apic_access_page); + if (kvm->arch.ept_identity_pagetable) + put_page(kvm->arch.ept_identity_pagetable); +#endif /* APIC */ #else XXX_KVM_PROBE; -#endif - return (r); +#endif /* XXX */ +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + cleanup_srcu_struct(&kvm->srcu); +#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ } +void +kvm_arch_destroy_vm(struct kvm *kvmp) +{ + if (kvmp == NULL) + return; /* nothing to do here */ -int kvm_arch_prepare_memory_region(struct kvm *kvm, + if (kvmp->arch.aliases) { + kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases)); + kvmp->arch.aliases = NULL; + } + kmem_free(kvmp, sizeof (struct kvm)); +} + +int +kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, struct kvm_userspace_memory_region *mem, int user_alloc) { @@ -1162,171 +5120,154 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return (0); } -/* - * Allocate some memory and give it an address in the guest physical address - * space. - * - * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding mmap_sem for write. - */ -int -kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, int user_alloc) +void +kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, + int user_alloc) { - if (mem->slot >= KVM_MEMORY_SLOTS) - return (EINVAL); - return (kvm_set_memory_region(kvm, mem, user_alloc)); -} + int npages = mem->memory_size >> PAGESHIFT; + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret = 0; -/* Caller must hold slots_lock. 
*/ -int -kvm_io_bus_register_dev(struct kvm *kvm, - enum kvm_bus bus_idx, struct kvm_io_device *dev) -{ - struct kvm_io_bus *new_bus, *bus; +#ifdef XXX + down_write(¤t->mm->mmap_sem); + ret = munmap(old.userspace_addr, + old.npages * PAGESIZE); + up_write(¤t->mm->mmap_sem); +#else + XXX_KVM_PROBE; + /* see comment in kvm_arch_prepare_memory_region */ + /* + * XXX this needs to be here, but I'm getting kernel heap + * corruption panics with someone writing to a buffer after it + * is freed + */ + kmem_free((caddr_t)old.userspace_addr, old.npages * PAGESIZE); +#endif + if (ret < 0) { + cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + } - bus = kvm->buses[bus_idx]; - if (bus->dev_count > NR_IOBUS_DEVS-1) - return (-ENOSPC); + mutex_enter(&kvm->mmu_lock); + if (!kvm->arch.n_requested_mmu_pages) { + unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } + + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + mutex_exit(&kvm->mmu_lock); +} - new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP); - if (!new_bus) - return (-ENOMEM); - memcpy(new_bus, bus, sizeof (struct kvm_io_bus)); - new_bus->devs[new_bus->dev_count++] = dev; +void +kvm_arch_flush_shadow(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); #ifdef XXX - rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); + kvm_reload_remote_mmus(kvm); #else XXX_KVM_PROBE; - kvm->buses[bus_idx] = new_bus; #endif - if (bus) - kmem_free(bus, sizeof (struct kvm_io_bus)); - - return (0); } -/* Caller must hold slots_lock. */ int -kvm_io_bus_unregister_dev(struct kvm *kvm, - enum kvm_bus bus_idx, struct kvm_io_device *dev) +kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - int i, r; - struct kvm_io_bus *new_bus, *bus; + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE || + vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || + vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))); +} - new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP); - if (!new_bus) - return (-ENOMEM); +void +kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + processorid_t cpu = vcpu->cpu; - bus = kvm->buses[bus_idx]; - memcpy(new_bus, bus, sizeof (struct kvm_io_bus)); + mutex_enter(&vcpu->kvcpu_kick_lock); - r = -ENOENT; - for (i = 0; i < new_bus->dev_count; i++) { - if (new_bus->devs[i] == dev) { - r = 0; - new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; - break; - } - } + if (CV_HAS_WAITERS(&vcpu->kvcpu_kick_cv)) + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_wakeup); - if (r) { - kmem_free(new_bus, sizeof (struct kvm_io_bus)); - return (r); - } + cv_broadcast(&vcpu->kvcpu_kick_cv); + mutex_exit(&vcpu->kvcpu_kick_lock); -#ifdef XXX - rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; - kvm->buses[bus_idx] = new_bus; -#endif - kmem_free(bus, sizeof (struct kvm_io_bus)); - return (r); + if (cpu != CPU->cpu_id && cpu != -1) { + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) { + /* + * If we haven't already kicked this VCPU, we'll poke + * the the CPU on which it's running. (This will serve + * to induce a VM exit.) 
+ */ + poke_cpu(cpu); + } + } } -long -kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode) +int +kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - void *argp = (void *)arg; - int r; - proc_t *p; + return (kvm_x86_ops->interrupt_allowed(vcpu)); +} - if (kvmp->mm != curproc->p_as) - return (EIO); +unsigned long +kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; - switch (ioctl) { -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - case KVM_REGISTER_COALESCED_MMIO: { - struct kvm_coalesced_mmio_zone zone; - r = EFAULT; - if (copyin(argp, &zone, sizeof (zone))) - goto out; - r = ENXIO; - r = kvm_vm_ioctl_register_coalesced_mmio(kvmp, &zone); - if (r) - goto out; - r = 0; - break; - } - case KVM_UNREGISTER_COALESCED_MMIO: { - struct kvm_coalesced_mmio_zone zone; - r = EFAULT; - if (copyin(argp, &zone, sizeof (zone))) - goto out; - r = ENXIO; - r = kvm_vm_ioctl_unregister_coalesced_mmio(kvmp, &zone); - if (r) - goto out; - r = 0; - break; - } -#endif -#ifdef XXX_KVM_DECLARATION - case KVM_IRQFD: { - struct kvm_irqfd data; + rflags = kvm_x86_ops->get_rflags(vcpu); - if (ddi_copyin(argp, &data, sizeof (data), mode)) - return (EFAULT); - r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags); - break; - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); - case KVM_IOEVENTFD: { - struct kvm_ioeventfd data; + return (rflags); +} - r = -EFAULT; - if (copy_from_user(&data, argp, sizeof (data))) - goto out; - r = kvm_ioeventfd(kvmp, &data); - break; +void +kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + vcpu->arch.singlestep_cs == get_segment_selector(vcpu, + VCPU_SREG_CS) && vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) { + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; } -#endif - default: - return (EINVAL); - } + kvm_x86_ops->set_rflags(vcpu, rflags); +} -out: - return (r); +inline gpa_t +gfn_to_gpa(gfn_t gfn) +{ + return ((gpa_t)gfn << PAGESHIFT); } -int -kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +/* + * For pages for which vmx needs physical addresses, + * linux allocates pages from an area that maps virtual + * addresses 1-1 with physical memory. In this way, + * translating virtual to physical just involves subtracting + * the start of the area from the virtual address. + * This solaris version uses kmem_alloc, so there is no + * direct mapping of virtual to physical. We'll change this + * later if performance is an issue. For now, we'll use + * hat_getpfnum() to do the conversion. Also note that + * we're assuming 64-bit address space (we won't run on + * 32-bit hardware). + */ +uint64_t +kvm_va2pa(caddr_t va) { - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE || - vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || - vcpu->arch.nmi_pending || - (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))); + uint64_t pa; + + pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET); + return (pa); } void -kvm_reload_remote_mmus(struct kvm *kvm) +kvm_migrate_timers(struct kvm_vcpu *vcpu) { - make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); } |
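
The SS branch of the big switch in kvm_load_segment_descriptor() in this change reduces to one rule: the target must be a writable data segment whose RPL and DPL both equal the current CPL. A minimal standalone sketch of that rule follows; the helper name ss_load_ok() and the selector/descriptor values are illustrative only and do not come from the driver.

#include <stdio.h>

/* Returns 1 if a selector/descriptor pair may be loaded into SS at `cpl'. */
static int
ss_load_ok(unsigned short selector, unsigned char type, unsigned char dpl,
    unsigned char cpl)
{
	unsigned char rpl = selector & 3;

	/* must be a writable data segment: not code (bit 3), writable (bit 1) */
	if ((type & 0xa) != 0x2)
		return (0);

	return (rpl == cpl && dpl == cpl);
}

int
main(void)
{
	/* selector 0x10 (RPL 0), writable data (type 0x2), DPL 0, CPL 0: ok */
	printf("%d\n", ss_load_ok(0x10, 0x2, 0, 0));
	/* same descriptor at CPL 3: rejected, as in the SS_VECTOR path above */
	printf("%d\n", ss_load_ok(0x10, 0x2, 0, 3));
	return (0);
}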
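
The TS_VECTOR check in kvm_task_switch() encodes the SDM's minimum TSS limits: a 32-bit TSS descriptor (type bit 3 set) needs a limit of at least 0x67, a 16-bit TSS at least 0x2b, and the descriptor must be present. The sketch below is a hypothetical restatement of that predicate, not code from the driver.

#include <stdio.h>

static int
tss_limit_ok(unsigned int type, int present, unsigned int limit)
{
	if (!present)
		return (0);

	if (type & 8)			/* 32-bit TSS descriptor */
		return (limit >= 0x67);

	return (limit >= 0x2b);		/* 16-bit TSS descriptor */
}

int
main(void)
{
	printf("%d\n", tss_limit_ok(9, 1, 0x67));	/* available 32-bit TSS: ok */
	printf("%d\n", tss_limit_ok(9, 1, 0x2b));	/* too small for 32-bit: rejected */
	printf("%d\n", tss_limit_ok(1, 1, 0x2b));	/* 16-bit TSS: ok */
	return (0);
}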
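
kvm_task_switch() only writes a back link into the new TSS (and sets EFLAGS.NT) for CALL- and gate-initiated switches, and it clears NT on IRET. A standalone restatement of that decision is sketched below; the TS_* enumerators are local stand-ins for the driver's TASK_SWITCH_* reason codes, and their numeric values are not taken from the source.

#include <stdio.h>

enum ts_reason { TS_CALL, TS_IRET, TS_JMP, TS_GATE };	/* local stand-ins */

#define	EFLAGS_NT	0x4000UL	/* Nested Task flag, bit 14 */

static unsigned long
nt_after_switch(enum ts_reason reason, unsigned long eflags, int *link_back)
{
	/* only CALL and task-gate switches record the old TSS selector */
	*link_back = (reason == TS_CALL || reason == TS_GATE);

	if (reason == TS_IRET)
		eflags &= ~EFLAGS_NT;
	if (*link_back)
		eflags |= EFLAGS_NT;

	return (eflags);
}

int
main(void)
{
	int link;

	printf("CALL: eflags=0x%lx link=%d\n",
	    nt_after_switch(TS_CALL, 0x2UL, &link), link);
	printf("IRET: eflags=0x%lx link=%d\n",
	    nt_after_switch(TS_IRET, 0x4002UL, &link), link);
	return (0);
}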
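
kvm_arch_vcpu_ioctl_set_sregs() uses the find_next_bit() added in this change to recover the lowest pending interrupt vector from sregs->interrupt_bitmap. The userland sketch below restates that scan (64-bit words, lowest set bit wins) assuming an LP64 build and a gcc/clang __builtin_ctzl; the bitmap contents are made up for illustration.

#include <stdio.h>

#define	BITMAP_WORDS	4	/* 256 vectors / 64 bits per word */

static unsigned long
first_set_bit(const unsigned long *map, unsigned long nbits)
{
	unsigned long i;

	for (i = 0; i < nbits / 64; i++) {
		if (map[i] != 0)
			return (i * 64 + __builtin_ctzl(map[i]));
	}

	return (nbits);	/* nothing pending */
}

int
main(void)
{
	unsigned long interrupt_bitmap[BITMAP_WORDS] = { 0 };
	unsigned long vec;

	interrupt_bitmap[0] |= 1UL << 32;	/* pretend vector 0x20 is pending */

	vec = first_set_bit(interrupt_bitmap, BITMAP_WORDS * 64);
	if (vec < BITMAP_WORDS * 64)
		printf("pending vector: 0x%lx\n", vec);	/* prints 0x20 */

	return (0);
}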
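
fx_init() clears the guest fxsave image from st_space onward and forces MXCSR to its reset value of 0x1f80, so only the register contents are wiped while the control/status words are re-established by kvm_fx_finit(). The sketch below just prints the offsets involved, using a local copy of the fxsave layout from this change (64-bit variant), to make the memset bounds easier to audit; it is not driver code.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct fxsave {
	uint16_t	cwd;
	uint16_t	swd;
	uint16_t	twd;
	uint16_t	fop;
	uint64_t	rip;
	uint64_t	rdp;
	uint32_t	mxcsr;
	uint32_t	mxcsr_mask;
	uint32_t	st_space[32];	/* 8 FP regs x 16 bytes */
	uint32_t	xmm_space[64];	/* 16 XMM regs x 16 bytes */
};

int
main(void)
{
	size_t after_mxcsr_mask = offsetof(struct fxsave, st_space);

	printf("fxsave image size:  %zu bytes\n", sizeof (struct fxsave));
	printf("zeroed region:      [%zu, %zu)\n",
	    after_mxcsr_mask, sizeof (struct fxsave));
	printf("mxcsr reset value:  0x1f80\n");
	return (0);
}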
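
As the comment above kvm_va2pa() explains, the Solaris port cannot rely on a 1:1 kernel mapping, so it combines the page frame number from hat_getpfnum() with the low-order page offset of the virtual address. The standalone arithmetic, with a made-up PFN and 4K pages, looks like this:

#include <stdio.h>
#include <stdint.h>

#define	PAGESHIFT	12
#define	PAGEOFFSET	((1UL << PAGESHIFT) - 1)

int
main(void)
{
	uint64_t pfn = 0x1234;			/* pretend hat_getpfnum() result */
	uint64_t va = 0xfffffe0012345678ULL;	/* arbitrary kernel VA */
	uint64_t pa = (pfn << PAGESHIFT) | (va & PAGEOFFSET);

	printf("pa = 0x%llx\n", (unsigned long long)pa);	/* 0x1234678 */
	return (0);
}

The same shift-and-mask shape also underlies gfn_to_gpa() above, which is just gfn << PAGESHIFT with no offset to fold back in.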