-rw-r--r-- | kvm.c | 6019
-rw-r--r-- | kvm_host.h | 87
-rw-r--r-- | kvm_mmu.c | 38
-rw-r--r-- | kvm_mmu.h | 1
-rw-r--r-- | kvm_subr.c | 165
-rw-r--r-- | kvm_vmx.c | 32
-rw-r--r-- | kvm_x86.c | 5419
-rw-r--r-- | kvm_x86host.h | 5
-rw-r--r-- | kvm_x86impl.h | 59
9 files changed, 5811 insertions, 6014 deletions
@@ -49,6 +49,7 @@ #include "kvm_apicdef.h" #include "kvm_iodev.h" #include "kvm.h" +#include "kvm_x86impl.h" #include "kvm_irq.h" #include "kvm_tss.h" #include "kvm_ioapic.h" @@ -56,9 +57,6 @@ #include "kvm_i8254.h" #include "kvm_mmu.h" #include "kvm_cache_regs.h" -#include "kvm_x86impl.h" -#include "kvm_lapic.h" -#include "kvm_vmx.h" #undef DEBUG @@ -66,11 +64,17 @@ * The entire state of the kvm device. */ typedef struct { - struct kvm *kds_kvmp; /* pointer to underlying VM */ - struct kvm_vcpu *kds_vcpu; /* pointer to VCPU */ + struct kvm *kds_kvmp; /* pointer to underlying VM */ + struct kvm_vcpu *kds_vcpu; /* pointer to VCPU */ } kvm_devstate_t; /* + * Globals + */ +page_t *bad_page; +pfn_t bad_pfn; + +/* * Tunables */ static int kvm_hiwat = 0x1000000; @@ -82,9 +86,7 @@ static void *kvm_state; /* DDI state */ static vmem_t *kvm_minor; /* minor number arena */ static dev_info_t *kvm_dip; /* global devinfo hanlde */ static minor_t kvm_base_minor; /* The only minor device that can be opened */ - -static int kvmid; /* monotonically increasing, unique per vm */ - +static int kvmid; /* monotonically increasing, unique per vm */ static int largepages_enabled = 1; static cpuset_t cpus_hardware_enabled; static volatile uint32_t hardware_enable_failed; @@ -92,817 +94,254 @@ static int kvm_usage_count; static list_t vm_list; static kmutex_t kvm_lock; static int ignore_msrs = 0; - -/* - * Driver forward declarations - */ -static int kvm_open(dev_t *devp, int flag, int otyp, cred_t *cred); -static int kvm_close(dev_t dev, int flag, int otyp, cred_t *cred); -static int kvm_read(dev_t dev, struct uio *uiop, cred_t *credp); -static int kvm_write(dev_t dev, struct uio *uiop, cred_t *credp); -static int kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, - cred_t *cred_p, int *rv); -static int kvm_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, - size_t len, size_t *maplen, uint_t model); -static int kvm_segmap(dev_t, off_t, struct as *, caddr_t *, off_t, - unsigned int, unsigned int, unsigned int, cred_t *); -static int kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, - void **result); -static int kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); - -static struct cb_ops kvm_cb_ops = { - kvm_open, - kvm_close, /* close */ - nodev, - nodev, - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - kvm_ioctl, - kvm_devmap, - nodev, /* mmap */ - kvm_segmap, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, - NULL, - D_NEW | D_MP | D_DEVMAP -}; -static struct dev_ops kvm_ops = { - DEVO_REV, - 0, - kvm_getinfo, - nulldev, /* identify */ - nulldev, /* probe */ - kvm_attach, - kvm_detach, - nodev, /* reset */ - &kvm_cb_ops, - (struct bus_ops *)0 -}; - -static struct modldrv modldrv = { - &mod_driverops, - "kvm driver v0.1", - &kvm_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - { &modldrv, NULL } -}; - -/* XXX */ -static int hardware_enable_all(void); -static void hardware_disable_all(void); -static void kvm_destroy_vm(struct kvm *); -static int kvm_avlmmucmp(const void *, const void *); -extern struct kvm_x86_ops vmx_x86_ops; -extern struct kvm_shared_msrs **shared_msrs; -struct kvm_shared_msrs_global shared_msrs_global; -static void kvm_on_user_return(struct kvm_vcpu *, - struct kvm_user_return_notifier *); -page_t *bad_page; -pfn_t bad_pfn; -struct kvm_x86_ops *kvm_x86_ops; - -inline int -kvm_exception_is_soft(unsigned int nr) -{ - return (nr == BP_VECTOR) || (nr == OF_VECTOR); -} - -/* - * EFER 
defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM - */ -#ifdef CONFIG_X86_64 -static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL; -#else -static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL; -#endif +static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; void -kvm_enable_efer_bits(uint64_t mask) +kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn) { - efer_reserved_bits &= ~mask; + vcpu->urn = urn; } void -kvm_disable_largepages(void) +kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn) { - largepages_enabled = 0; + vcpu->urn = NULL; } -int -kvm_arch_hardware_setup(void) +void +kvm_fire_urn(struct kvm_vcpu *vcpu) { - return (kvm_x86_ops->hardware_setup()); + if (vcpu->urn) + vcpu->urn->on_user_return(vcpu, vcpu->urn); } +/* + * Called when we've been asked to save our context. i.e. we're being swapped + * out. + */ void -bitmap_zero(unsigned long *dst, int nbits) +kvm_ctx_save(void *arg) { - int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long); - memset(dst, 0, len); + struct kvm_vcpu *vcpu = arg; + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(vcpu); } -struct kvm_mmu_page * -page_private(kvm_t *kvmp, page_t *page) +/* + * Called when we're being asked to restore our context. i.e. we're returning + * from being swapped out. + */ +void +kvm_ctx_restore(void *arg) { - kvm_mmu_page_t mp, *res; - mp.kmp_avlspt = (uintptr_t)page; - mutex_enter(&kvmp->kvm_avllock); - res = avl_find(&kvmp->kvm_avlmp, &mp, NULL); - mutex_exit(&kvmp->kvm_avllock); - ASSERT(res != NULL); - return (res); -} + int cpu; -inline struct kvm_mmu_page * -page_header(kvm_t *kvmp, hpa_t shadow_page) -{ - return (page_private(kvmp, pfn_to_page(shadow_page >> PAGESHIFT))); + cpu = CPU->cpu_seqid; + struct kvm_vcpu *vcpu = arg; + kvm_arch_vcpu_load(vcpu, cpu); } -struct kvm_memory_slot * -gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) -{ - int i; #ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#define pfn_valid(pfn) ((pfn < physmax) && (pfn != PFN_INVALID)) #else - struct kvm_memslots *slots = kvm->memslots; +#define pfn_valid(pfn) (pfn != PFN_INVALID) #endif - for (i = 0; i < slots->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &slots->memslots[i]; - - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - return (memslot); - } - return (NULL); -} - -gfn_t -unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +inline int +kvm_is_mmio_pfn(pfn_t pfn) { - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; + if (pfn_valid(pfn)) { #ifdef XXX - aliases = rcu_dereference(kvm->arch.aliases); -#else - XXX_KVM_SYNC_PROBE; - aliases = kvm->arch.aliases; -#endif - - for (i = 0; i < aliases->naliases; i++) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn && - gfn < alias->base_gfn + alias->npages) - return (alias->target_gfn + gfn - alias->base_gfn); - } - - return (gfn); -} - -int -kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; -#ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); + struct page *page = compound_head(pfn_to_page(pfn)); + return (PageReserved(page)); #else - struct kvm_memslots *slots = kvm->memslots; + XXX_KVM_PROBE; #endif - - gfn = unalias_gfn_instantiation(kvm, gfn); - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - 
struct kvm_memory_slot *memslot = &slots->memslots[i]; - - if (memslot->flags & KVM_MEMSLOT_INVALID) - continue; - - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) { - return (1); - } - } - - return (0); + return (0); + } else + return (1); } /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * Switches to specified vcpu, until a matching vcpu_put() */ - -#define MSR_KVM_WALL_CLOCK 0x11 -#define MSR_KVM_SYSTEM_TIME 0x12 - -#define KVM_SAVE_MSRS_BEGIN 5 -static uint32_t msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA -}; - -static unsigned num_msrs_to_save; - -static uint32_t emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -uint64_t -native_read_msr_safe(unsigned int msr, int *err) -{ - DECLARE_ARGS(val, low, high); - uint64_t ret = 0; - on_trap_data_t otd; - - if (on_trap(&otd, OT_DATA_ACCESS) == 0) { - ret = native_read_msr(msr); - *err = 0; - } else { - *err = EINVAL; /* XXX probably not right... */ - } - no_trap(); - - return (ret); -} - -/* Can be uninlined because referenced by paravirt */ -int -native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +void +vcpu_load(struct kvm_vcpu *vcpu) { - int err = 0; - on_trap_data_t otd; - - if (on_trap(&otd, OT_DATA_ACCESS) == 0) { - native_write_msr(msr, low, high); - } else { - err = EINVAL; /* XXX probably not right... */ - } - no_trap(); + int cpu; - return (err); + mutex_enter(&vcpu->mutex); + kpreempt_disable(); + cpu = CPU->cpu_seqid; + installctx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, + NULL, NULL, NULL); + kvm_arch_vcpu_load(vcpu, cpu); + kpreempt_enable(); } -static void -kvm_init_msr_list(void) +void +vcpu_put(struct kvm_vcpu *vcpu) { - uint32_t dummy[2]; - unsigned i, j; - - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; + kpreempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(vcpu); + removectx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, + NULL, NULL, NULL); + kpreempt_enable(); + mutex_exit(&vcpu->mutex); } -uint64_t cpu_tsc_khz; -extern uint64_t cpu_freq_hz; - static void -kvm_timer_init(void) +ack_flush(void *_completed) { - int cpu; - - /* - * XXX We assume that any machine running solaris kvm - * has constant time stamp counter increment rate. - * This will be true for all but older machines. 
- */ - /* assume pi_clock in mhz */ - cpu_tsc_khz = (cpu_freq_hz / 1000); } int -kvm_arch_init(void *opaque) +make_all_cpus_request(struct kvm *kvm, unsigned int req) { - int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + int i; + cpuset_t set; + processorid_t me, cpu; + struct kvm_vcpu *vcpu; - if (ops->cpu_has_kvm_support()) { - cmn_err(CE_WARN, "kvm: no hardware support\n"); - r = ENOTSUP; - goto out; + CPUSET_ZERO(set); + + mutex_enter(&kvm->requests_lock); + me = curthread->t_cpu->cpu_id; + for (i = 0; i < kvm->online_vcpus; i++) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + break; + if (test_and_set_bit(req, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != me) + CPUSET_ADD(set, cpu); } - if (ops->disabled_by_bios()) { - cmn_err(CE_WARN, "kvm: disabled by bios\n"); - r = ENOTSUP; - goto out; + if (CPUSET_ISNULL(set)) + kvm_xcall(KVM_CPUALL, ack_flush, NULL); + else { + kpreempt_disable(); + xc_sync((xc_arg_t) ack_flush, (xc_arg_t) NULL, + 0, CPUSET2BV(set), (xc_func_t) kvm_xcall_func); + kpreempt_enable(); } + mutex_exit(&kvm->requests_lock); - r = kvm_mmu_module_init(); - if (r) - goto out; - - kvm_init_msr_list(); - - kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - kvm_mmu_set_base_ptes(PT_PRESENT_MASK); - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - - kvm_timer_init(); - - return (0); - -out: - return (r); + return (1); } -page_t * -alloc_page(size_t size, int flag) +void +kvm_flush_remote_tlbs(struct kvm *kvm) { - caddr_t page_addr; - pfn_t pfn; - page_t *pp; - - if ((page_addr = kmem_zalloc(size, flag)) == NULL) - return ((page_t *)NULL); - - pp = page_numtopp_nolock(hat_getpfnum(kas.a_hat, page_addr)); - return (pp); + if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + KVM_KSTAT_INC(kvm, kvmks_remote_tlb_flush); } void -kvm_arch_check_processor_compat(void *rtn) +kvm_reload_remote_mmus(struct kvm *kvm) { - kvm_x86_ops->check_processor_compatibility(rtn); + make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } int -kvm_init(void *opaque) +kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { int r; - int cpu; - - r = kvm_arch_init(opaque); - - if (r != DDI_SUCCESS) - return (r); - - bad_page = alloc_page(PAGESIZE, KM_SLEEP); - bad_pfn = bad_page->p_pagenum; - -#ifdef XXX - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto out_free_0; - } -#else - XXX_KVM_PROBE; -#endif - r = kvm_arch_hardware_setup(); - - if (r != DDI_SUCCESS) - goto out_free_0a; - -#ifdef XXX - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_arch_check_processor_compat, - &r, 1); - if (r < 0) - goto out_free_1; - } -#else - r = 0; - kvm_xcall(KVM_CPUALL, kvm_arch_check_processor_compat, &r); - if (r < 0) - goto out_free_1; - XXX_KVM_PROBE; -#endif - + mutex_init(&vcpu->mutex, NULL, MUTEX_DRIVER, 0); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; #ifdef XXX - r = register_cpu_notifier(&kvm_cpu_notifier); - if (r) - goto out_free_2; - register_reboot_notifier(&kvm_reboot_notifier); - - r = sysdev_class_register(&kvm_sysdev_class); - if (r) - goto out_free_3; - - r = sysdev_register(&kvm_sysdev); - if (r) - goto out_free_4; + init_waitqueue_head(&vcpu->wq); #else XXX_KVM_PROBE; #endif + vcpu->run = ddi_umem_alloc(PAGESIZE * 2, DDI_UMEM_SLEEP, &vcpu->cookie); -#ifdef XXX - kvm_chardev_ops.owner = module; - kvm_vm_fops.owner = module; - kvm_vcpu_fops.owner = module; + r = kvm_arch_vcpu_init(vcpu); - r = misc_register(&kvm_dev); - if (r) { - 
cmn_err(CE_WARN, "kvm: misc device register failed\n"); - goto out_free; + if (r != 0) { + vcpu->run = NULL; + ddi_umem_free(vcpu->cookie); + return (r); } - /* - * XXX - if kernel preemption occurs, we probably need - * to implement these, and add hooks to the preemption code. - * For right now, we'll make the totally unreasonable - * assumption that we won't be preempted while in the - * kernel, i.e., no realtime threads are running - */ - kvm_preempt_ops.sched_in = kvm_sched_in; - kvm_preempt_ops.sched_out = kvm_sched_out; - - kvm_init_debug(); -#else - XXX_KVM_PROBE; -#endif - return (0); - -out_free: -out_free_5: -#ifdef XXX - sysdev_unregister(&kvm_sysdev); -out_free_4: - sysdev_class_unregister(&kvm_sysdev_class); -out_free_3: - unregister_reboot_notifier(&kvm_reboot_notifier); - unregister_cpu_notifier(&kvm_cpu_notifier); -#else - XXX_KVM_PROBE; -#endif -out_free_2: -out_free_1: -#ifdef XXX - kvm_arch_hardware_unsetup(); -#else - XXX_KVM_PROBE; -#endif -out_free_0a: -#ifdef XXX - free_cpumask_var(cpus_hardware_enabled); -#else - XXX_KVM_PROBE; -#endif -out_free_0: -#ifdef XXX - free_page(bad_page, PAGESIZE); -#else - XXX_KVM_PROBE; -#endif -out: -#ifdef XXX - kvm_arch_exit(); -#else - XXX_KVM_PROBE; -#endif -out_fail: - return (r); } void -kvm_define_shared_msr(unsigned slot, uint32_t msr) -{ - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; -#ifdef XXX - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); -#else - XXX_KVM_SYNC_PROBE; -#endif -} - -int -_init(void) -{ - - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static int -kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - minor_t instance; - - if (kpm_enable == 0) { - cmn_err(CE_WARN, "kvm: kpm_enable must be true\n"); - return (DDI_FAILURE); - } - - - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (kvm_dip != NULL) - return (DDI_FAILURE); - - if (ddi_soft_state_init(&kvm_state, sizeof (kvm_devstate_t), 1) != 0) - return (DDI_FAILURE); - - instance = ddi_get_instance(dip); - if (ddi_create_minor_node(dip, "kvm", - S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_fini(&kvm_state); - return (DDI_FAILURE); - } - - mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); - kvm_x86_ops = &vmx_x86_ops; - if (vmx_init() != DDI_SUCCESS) { - ddi_soft_state_fini(&kvm_state); - ddi_remove_minor_node(dip, NULL); - mutex_destroy(&kvm_lock); - return (DDI_FAILURE); - } - - if (hardware_enable_all() != 0) { - ddi_soft_state_fini(&kvm_state); - ddi_remove_minor_node(dip, NULL); - mutex_destroy(&kvm_lock); - vmx_fini(); - return (DDI_FAILURE); - } - - kvm_dip = dip; - kvm_base_minor = instance; - - list_create(&vm_list, sizeof (struct kvm), - offsetof(struct kvm, vm_list)); - kvm_minor = vmem_create("kvm_minor", (void *)1, UINT32_MAX - 1, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - - ddi_report_dev(dip); - - return (DDI_SUCCESS); -} - -static int -kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - int instance; - - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - VERIFY(kvm_dip != NULL && kvm_dip == dip); - instance = ddi_get_instance(dip); - VERIFY(instance == kvm_base_minor); - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - list_destroy(&vm_list); - vmem_destroy(kvm_minor); - kvm_dip = NULL; - - hardware_disable_all(); - mutex_destroy(&kvm_lock); - 
ddi_soft_state_fini(&kvm_state); - vmx_fini(); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) -{ - kvm_devstate_t *rsp; - int error = DDI_FAILURE; - - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = kvm_dip; - break; - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)((uint64_t)getminor((dev_t)arg)); - error = DDI_SUCCESS; - break; - - default: - break; - } - - return (error); -} - -/*ARGSUSED*/ -static int -kvm_open(dev_t *devp, int flag, int otype, cred_t *credp) +kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - minor_t minor; - kvm_devstate_t *ksp; - - if (flag & FEXCL || flag & FNDELAY) - return (EINVAL); - - if (otype != OTYP_CHR) - return (EINVAL); - - /* - * XXX This should be its own privilage - */ - if (drv_priv(credp) != 0) - return (EPERM); - - if (!(flag & FREAD && flag & FWRITE)) - return (EINVAL); - - if (getminor(*devp) != kvm_base_minor) - return (ENXIO); - - minor = (minor_t)(uintptr_t)vmem_alloc(kvm_minor, - 1, VM_BESTFIT | VM_SLEEP); - - if (ddi_soft_state_zalloc(kvm_state, minor) != 0) { - vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); - return (ENXIO); - } - - *devp = makedevice(getmajor(*devp), minor); - ksp = ddi_get_soft_state(kvm_state, minor); - VERIFY(ksp != NULL); - - return (0); + kvm_arch_vcpu_uninit(vcpu); + ddi_umem_free(vcpu->cookie); } -/*ARGSUSED*/ -static int -kvm_close(dev_t dev, int flag, int otyp, cred_t *cred) +/* + * Note if we want to implement the kvm mmu notifier components than the + * following two functions will need to be readdressed. + */ +static int kvm_init_mmu_notifier(struct kvm *kvm) { - kvm_devstate_t *ksp; - minor_t minor = getminor(dev); - kvm_t *kvmp; - - VERIFY(getminor(dev) != kvm_base_minor); - ksp = ddi_get_soft_state(kvm_state, minor); - - if ((kvmp = ksp->kds_kvmp) != NULL) { - mutex_enter(&kvm_lock); - - if (kvmp->kvm_clones > 0) { - kvmp->kvm_clones--; - mutex_exit(&kvm_lock); - } else { - mutex_exit(&kvm_lock); - kvm_destroy_vm(kvmp); - } - } - - ddi_soft_state_free(kvm_state, minor); - vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); - return (0); } static void -hardware_enable(void *junk) +kvm_fini_mmu_notifier(struct kvm *kvm) { - int cpu; - int r; - - cpu = curthread->t_cpu->cpu_id; - - if (CPU_IN_SET(cpus_hardware_enabled, cpu)) - return; - - CPUSET_ADD(cpus_hardware_enabled, cpu); - - r = kvm_arch_hardware_enable(NULL); - - if (r) { - CPUSET_DEL(cpus_hardware_enabled, cpu); - atomic_inc_32(&hardware_enable_failed); - cmn_err(CE_WARN, "kvm: enabling virtualization CPU%d failed\n", - cpu); - } } static void -hardware_disable(void *junk) +kvm_destroy_vm(struct kvm *kvmp) { - int cpu = curthread->t_cpu->cpu_id; + int ii; + void *cookie; - if (!CPU_IN_SET(cpus_hardware_enabled, cpu)) + if (kvmp == NULL) return; - CPUSET_DEL(cpus_hardware_enabled, cpu); - kvm_arch_hardware_disable(NULL); -} - -/* - * The following needs to run on each cpu. Currently, - * wait is always 1, so we use the kvm_xcall() routine which - * calls xc_sync. Later, if needed, the implementation can be - * changed to use xc_call or xc_call_nowait. 
- */ -#define on_each_cpu(func, info, wait) \ - /*CSTYLED*/ \ - ({ \ - kvm_xcall(KVM_CPUALL, func, info); \ - 0; \ - }) - -static void -hardware_disable_all_nolock(void) -{ - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable, NULL, 1); -} - -static void -hardware_disable_all(void) -{ - mutex_enter(&kvm_lock); - hardware_disable_all_nolock(); - mutex_exit(&kvm_lock); -} - -static int -hardware_enable_all(void) -{ - int r = 0; - - mutex_enter(&kvm_lock); - - kvm_usage_count++; - if (kvm_usage_count == 1) { - hardware_enable_failed = 0; - on_each_cpu(hardware_enable, NULL, 1); - - if (hardware_enable_failed) { - hardware_disable_all_nolock(); - r = EBUSY; - } - } + if (kvmp->kvm_kstat != NULL) + kstat_delete(kvmp->kvm_kstat); - mutex_exit(&kvm_lock); + kvm_arch_destroy_vm_comps(kvmp); - return (r); -} +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + kvm_coalesced_mmio_free(kvmp); +#endif -/* - * Note if we want to implement the kvm mmu notifier components than the - * following two functions will need to be readdressed. - */ -static int kvm_init_mmu_notifier(struct kvm *kvm) -{ - return (0); -} + list_remove(&vm_list, kvmp); + /* + * XXX: The fact that we're cleaning these up here means that we aren't + * properly cleaning them up somewhere else. + */ + cookie = NULL; + while (avl_destroy_nodes(&kvmp->kvm_avlmp, &cookie) != NULL) + continue; + avl_destroy(&kvmp->kvm_avlmp); + mutex_destroy(&kvmp->kvm_avllock); + mutex_destroy(&kvmp->slots_lock); + mutex_destroy(&kvmp->irq_lock); + mutex_destroy(&kvmp->lock); + mutex_destroy(&kvmp->requests_lock); + mutex_destroy(&kvmp->mmu_lock); + kvmp->mm = NULL; + kvm_fini_mmu_notifier(kvmp); -static void -kvm_fini_mmu_notifier(struct kvm *kvm) -{ -} + for (ii = 0; ii < KVM_NR_BUSES; ii++) + kmem_free(kvmp->buses[ii], sizeof (struct kvm_io_bus)); -void -kvm_arch_flush_shadow(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); -#ifdef XXX - kvm_reload_remote_mmus(kvm); -#else - XXX_KVM_PROBE; + rw_destroy(&kvmp->kvm_rwlock); +#ifdef CONFIG_HAVE_KVM_IRQCHIP + /* + * These lists are contained by the pic. However, the pic isn't + */ + list_destroy(&kvmp->irq_ack_notifier_list); + list_destroy(&kvmp->mask_notifier_list); #endif + kvm_arch_destroy_vm(kvmp); } static struct kvm * @@ -997,143 +436,10 @@ kvm_create_vm(void) return (kvmp); } -static void -kvm_destroy_vm(struct kvm *kvmp) -{ - int ii; - void *cookie; - - if (kvmp == NULL) - return; - - if (kvmp->kvm_kstat != NULL) - kstat_delete(kvmp->kvm_kstat); - - kvm_arch_destroy_vm_comps(kvmp); - -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - kvm_coalesced_mmio_free(kvmp); -#endif - - list_remove(&vm_list, kvmp); - /* - * XXX: The fact that we're cleaning these up here means that we aren't - * properly cleaning them up somewhere else. - */ - cookie = NULL; - while (avl_destroy_nodes(&kvmp->kvm_avlmp, &cookie) != NULL) - continue; - avl_destroy(&kvmp->kvm_avlmp); - mutex_destroy(&kvmp->kvm_avllock); - mutex_destroy(&kvmp->slots_lock); - mutex_destroy(&kvmp->irq_lock); - mutex_destroy(&kvmp->lock); - mutex_destroy(&kvmp->requests_lock); - mutex_destroy(&kvmp->mmu_lock); - kvmp->mm = NULL; - kvm_fini_mmu_notifier(kvmp); - - for (ii = 0; ii < KVM_NR_BUSES; ii++) - kmem_free(kvmp->buses[ii], sizeof (struct kvm_io_bus)); - - rw_destroy(&kvmp->kvm_rwlock); -#ifdef CONFIG_HAVE_KVM_IRQCHIP - /* - * These lists are contained by the pic. 
However, the pic isn't - */ - list_destroy(&kvmp->irq_ack_notifier_list); - list_destroy(&kvmp->mask_notifier_list); -#endif - kvm_arch_destroy_vm(kvmp); -} - -static int -kvm_dev_ioctl_create_vm(kvm_devstate_t *ksp, intptr_t arg, int *rv) -{ - if (ksp->kds_kvmp != NULL) - return (EINVAL); - - ksp->kds_kvmp = kvm_create_vm(); - - if (ksp->kds_kvmp == NULL) { - cmn_err(CE_WARN, "Could not create new vm\n"); - return (EIO); - } - *rv = ksp->kds_kvmp->kvmid; - return (DDI_SUCCESS); -} - -static long -kvm_dev_ioctl_check_extension_generic(long arg, int *rv) -{ - switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_CAP_SET_BOOT_CPU_ID: -#endif - case KVM_CAP_INTERNAL_ERROR_DATA: - *rv = 1; - return (DDI_SUCCESS); -#ifdef CONFIG_HAVE_KVM_IRQCHIP - case KVM_CAP_IRQ_ROUTING: - *rv = KVM_MAX_IRQ_ROUTES; - return (DDI_SUCCESS); -#endif - default: - break; - } - return (kvm_dev_ioctl_check_extension(arg, rv)); -} - - -void -kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, - int user_alloc) -{ - - int npages = mem->memory_size >> PAGESHIFT; - - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { - int ret = 0; - -#ifdef XXX - down_write(¤t->mm->mmap_sem); - ret = munmap(old.userspace_addr, - old.npages * PAGESIZE); - up_write(¤t->mm->mmap_sem); -#else - XXX_KVM_PROBE; - /* see comment in kvm_arch_prepare_memory_region */ - /* - * XXX this needs to be here, but I'm getting kernel heap - * corruption panics with someone writing to a buffer after it - * is freed - */ - kmem_free((caddr_t)old.userspace_addr, old.npages * PAGESIZE); -#endif - if (ret < 0) { - cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - } - - mutex_enter(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - mutex_exit(&kvm->mmu_lock); -} - /* * Free any memory in @free but not in @dont. */ -void +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { @@ -1174,6 +480,13 @@ kvm_free_physmem(struct kvm *kvm) kmem_free(kvm->memslots, sizeof (struct kvm_memslots)); } + +void +kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc_32(&kvm->users_count); +} + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -1400,763 +713,293 @@ kvm_set_memory_region(kvm_t *kvm, return (r); } - -static int -kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr) +int +kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, int user_alloc) { - /* - * XXX later, if adding other arch beside x86, need to do something - * else here - */ - return (kvm_x86_ops->set_tss_addr(kvmp, addr)); + if (mem->slot >= KVM_MEMORY_SLOTS) + return (EINVAL); + + return (kvm_set_memory_region(kvm, mem, user_alloc)); } -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +void +kvm_disable_largepages(void) { - /* ecx is often an input as well as an output. 
*/ - __asm__ volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + largepages_enabled = 0; } -#define __cpuid native_cpuid - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void -cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +int +is_error_pfn(pfn_t pfn) { - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); + return (pfn == bad_pfn); } -static void -do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index) +static unsigned long +bad_hva(void) { - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; + return (PAGEOFFSET); } -static int -is_efer_nx(void) +int +kvm_is_error_hva(unsigned long addr) { - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return (efer & EFER_NX); + return (addr == bad_hva()); } -#define F(x) bit(X86_FEATURE_##x) - -static void -do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, - uint32_t index, int *nent, int maxnent) +struct kvm_memory_slot * +gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) { - unsigned int ddic; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); + int i; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); #else - unsigned f_gbpages = 0; - unsigned f_lm = 0; + struct kvm_memslots *slots = kvm->memslots; #endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - - /* cpuid 1.edx */ - const uint32_t kvm_supported_word0_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const uint32_t kvm_supported_word1_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const uint32_t kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; - /* cpuid 0x80000001.ecx */ - const uint32_t kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | - 0 /* SKINIT */ | 0 /* WDT */; - - /* all calls to cpuid_count() should be made on the same cpu */ - /* XXX - right now, system panics at ddi_exit_critical() */ - /* XXX - to run everything on same cpu, bind qemu at startup */ - kpreempt_disable(); - - do_cpuid_1_ent(entry, function, index); - 
++*nent; + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; - switch (function) { - case 0: - entry->eax = min(entry->eax, (uint32_t)0xb); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word4_x86_features; - /* - * we support x2apic emulation even if host does not support - * it since we emulate x2apic in software - */ - entry->ecx |= F(X2APIC); - break; - /* - * function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT - */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 and 0xb have additional index. */ - case 4: { - int i, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xb: { - int i, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - entry->ecx &= kvm_supported_word6_x86_features; - break; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) + return (memslot); } - /* - * XXX - see comment above for ddi_enter_critical() - * - * ddi_exit_critical(ddic); - */ - kpreempt_enable(); + return (NULL); } -#undef F - -static int -kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 *entries) +struct kvm_memory_slot * +gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = E2BIG; - uint32_t func; - int allocsize = 0; - - if (cpuid->nent < 1) - goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; - r = ENOMEM; - allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent; - cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP); - - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); - - r = E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); - r = E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - r = EFAULT; - if (copyout(cpuid_entries, entries, - nent * sizeof (kvm_cpuid_entry2_t))) - goto out_free; - - 
cpuid->nent = nent; - r = 0; - -out_free: - kmem_free(cpuid_entries, allocsize); -out: - return (r); + gfn = unalias_gfn(kvm, gfn); + return (gfn_to_memslot_unaliased(kvm, gfn)); } -static inline void -__vmwrite(unsigned long field, unsigned long value) +int +kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { - uint8_t err = 0; - - /*CSTYLED*/ - __asm__ volatile ( ASM_VMX_VMWRITE_RAX_RDX "\n\t" "setna %0" - /* XXX: CF==1 or ZF==1 --> crash (ud2) */ - /* "ja 1f ; ud2 ; 1:\n" */ - : "=q"(err) : "a" (value), "d" (field) - : "cc", "memory"); - - /* XXX the following should be ifdef debug... */ - if (err) { -#ifdef XXX - vmcs_read32(VM_INSTRUCTION_ERROR); - cmn_err(CE_WARN, "_vmwrite: error writing %lx to %lx: " - "error number = %d\n", value, field, err & 0xff); + int i; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); #else - XXX_KVM_PROBE; + struct kvm_memslots *slots = kvm->memslots; #endif - } -} - -void -kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - -static int -kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - if (!vcpu->time_page) - return (0); + gfn = unalias_gfn_instantiation(kvm, gfn); - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; - return (1); -} + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; -void -kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->vcpu_load(vcpu, cpu); -#ifdef XXX - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) { + return (1); + } } -#else - XXX_KVM_PROBE; -#endif - kvm_request_guest_time_update(vcpu); -} - -void -kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_fpu_reload); - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); - KVM_TRACE1(fpu, int, 0); -} - -/* straight from xen code... */ -void -ldt_load(void) -{ - *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; - wr_ldtr(ULDT_SEL); -} - -inline int -is_pae(struct kvm_vcpu *vcpu) -{ - return (kvm_read_cr4_bits(vcpu, X86_CR4_PAE)); -} - -void -kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_put_guest_fpu(vcpu); - - kvm_x86_ops->vcpu_put(vcpu); -} - -void -kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, - struct kvm_user_return_notifier *urn) -{ - vcpu->urn = urn; -} - -void -kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, - struct kvm_user_return_notifier *urn) -{ - vcpu->urn = NULL; -} - -void -kvm_fire_urn(struct kvm_vcpu *vcpu) -{ - if (vcpu->urn) - vcpu->urn->on_user_return(vcpu, vcpu->urn); -} - -/* - * Called when we've been asked to save our context. i.e. we're being swapped - * out. - */ -void -kvm_ctx_save(void *arg) -{ - struct kvm_vcpu *vcpu = arg; - kvm_arch_vcpu_put(vcpu); - kvm_fire_urn(vcpu); -} - -/* - * Called when we're being asked to restore our context. i.e. we're returning - * from being swapped out. 
- */ -void -kvm_ctx_restore(void *arg) -{ - int cpu; - cpu = CPU->cpu_seqid; - struct kvm_vcpu *vcpu = arg; - kvm_arch_vcpu_load(vcpu, cpu); + return (0); } -/* - * Switches to specified vcpu, until a matching vcpu_put() - */ -void -vcpu_load(struct kvm_vcpu *vcpu) +unsigned long +kvm_host_page_size(struct kvm *kvm, gfn_t gfn) { - int cpu; + struct vm_area_struct *vma; + unsigned long addr, size; - mutex_enter(&vcpu->mutex); - kpreempt_disable(); - cpu = CPU->cpu_seqid; - installctx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, - NULL, NULL, NULL); - kvm_arch_vcpu_load(vcpu, cpu); - kpreempt_enable(); -} + size = PAGESIZE; -void -vcpu_put(struct kvm_vcpu *vcpu) -{ - kpreempt_disable(); - kvm_arch_vcpu_put(vcpu); - kvm_fire_urn(vcpu); - removectx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, - NULL, NULL, NULL); - kpreempt_enable(); - mutex_exit(&vcpu->mutex); -} + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (PAGESIZE); -/* - * find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) - */ -static int -is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - uint32_t function, uint32_t index) -{ - if (e->function != function) - return (0); - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return (0); - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return (0); - return (1); -} +#ifdef XXX + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; -static int -move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return (j); - } - } + size = vma_kernel_pagesize(vma); - return (0); /* silence gcc, even though control never reaches here */ +out: + up_read(¤t->mm->mmap_sem); + return (size); +#else + XXX_KVM_PROBE; + return (PAGESIZE); +#endif } -struct kvm_cpuid_entry2 * -kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) +int +memslot_id(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_cpuid_entry2 *best = NULL; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#else + struct kvm_memslots *slots = kvm->memslots; +#endif + struct kvm_memory_slot *memslot = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) break; - } - /* - * Both basic or both extended? 
- */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; } - return (best); + return (memslot - slots->memslots); } -static int -kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +unsigned long +gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - return (E2BIG); - - bcopy(cpuid->entries, vcpu->arch.cpuid_entries, - cpuid->nent * sizeof (struct kvm_cpuid_entry2)); + struct kvm_memory_slot *slot; - vcpu_load(vcpu); - vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return (bad_hva()); - return (0); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); } -static int -kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +static pfn_t +hva_to_pfn(struct kvm *kvm, unsigned long addr) { - int r; - struct kvm_cpuid_entry2 *entries = cpuid->entries; - - cpuid->nent = vcpu->arch.cpuid_nent; - - if (cpuid->nent < vcpu->arch.cpuid_nent) - return (E2BIG); + page_t page[1]; + int npages; + pfn_t pfn; + proc_t *procp = ttoproc(curthread); + struct as *as = procp->p_as; - bcopy(&vcpu->arch.cpuid_entries, cpuid->entries, - vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2)); +#ifdef XXX - return (0); -} + npages = get_user_pages_fast(addr, 1, 1, page); -unsigned long -kvm_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags; + if (unlikely(npages != 1)) { + struct vm_area_struct *vma; - rflags = kvm_x86_ops->get_rflags(vcpu); + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + if (vma == NULL || addr < vma->vm_start || + !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); + get_page(bad_page); + return (page_to_pfn(bad_page)); + } - return (rflags); + pfn = ((addr - vma->vm_start) >> PAGESHIFT) + vma->vm_pgoff; + up_read(¤t->mm->mmap_sem); + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else + pfn = page_to_pfn(page[0]); +#else + XXX_KVM_PROBE; + if (addr < kernelbase) + pfn = hat_getpfnum(as->a_hat, (caddr_t)addr); + else + pfn = hat_getpfnum(kas.a_hat, (caddr_t)addr); +#endif + return (pfn); } -int -kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +pfn_t +gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - vcpu_load(vcpu); - - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); -#ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); -#endif + unsigned long addr; + pfn_t pfn; - regs->rip = kvm_rip_read(vcpu); - regs->rflags = 
kvm_get_rflags(vcpu); + addr = gfn_to_hva(kvm, gfn); - vcpu_put(vcpu); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return (page_to_pfn(bad_page)); + } - return (0); -} + pfn = hva_to_pfn(kvm, addr); -void -kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); + return (pfn); } -static uint16_t -get_segment_selector(struct kvm_vcpu *vcpu, int seg) +page_t * +gfn_to_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_segment kvm_seg; + pfn_t pfn = gfn_to_pfn(kvm, gfn); - kvm_get_segment(vcpu, &kvm_seg, seg); + if (!kvm_is_mmio_pfn(pfn)) + return (pfn_to_page(pfn)); - return (kvm_seg.selector); + get_page(bad_page); + return (bad_page); } void -kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - vcpu->arch.singlestep_cs == get_segment_selector(vcpu, - VCPU_SREG_CS) && vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) { - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - } - - kvm_x86_ops->set_rflags(vcpu, rflags); -} - -int -kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +kvm_release_pfn_clean(pfn_t pfn) { - vcpu_load(vcpu); - - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); -#ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + put_page(pfn_to_page(pfn)); +#else + XXX_KVM_PROBE; #endif - - kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); - - vcpu->arch.exception.pending = 0; - - vcpu_put(vcpu); - - return (0); -} - -int -kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof (fxsave->xmm_space)); - - vcpu_put(vcpu); - - return (0); } -int -kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void +kvm_release_page_dirty(page_t *page) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof (fxsave->xmm_space)); - - vcpu_put(vcpu); - - return (0); + kvm_release_pfn_dirty(page_to_pfn(page)); } -unsigned long -kvm_get_cr8(struct kvm_vcpu *vcpu) +void +kvm_release_pfn_dirty(pfn_t pfn) { - if (irqchip_in_kernel(vcpu->kvm)) { - 
return (kvm_lapic_get_cr8(vcpu)); - } else { - return (vcpu->arch.cr8); - } + kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); } -int -kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +void +kvm_set_pfn_dirty(pfn_t pfn) { - struct descriptor_table dt; - - vcpu_load(vcpu); - - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.limit; - sregs->idt.base = dt.base; - kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.limit; - sregs->gdt.base = dt.base; - - sregs->cr0 = kvm_read_cr0(vcpu); - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = kvm_read_cr4(vcpu); - sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); - - memset(sregs->interrupt_bitmap, 0, sizeof (sregs->interrupt_bitmap)); - - if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) { - set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + SetPageDirty(page); /* XXX - not defined in linux?! */ } - - vcpu_put(vcpu); - - return (0); -} - -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - - -inline void -kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, int soft) -{ - vcpu->arch.interrupt.pending = 1; - vcpu->arch.interrupt.soft = soft; - vcpu->arch.interrupt.nr = vector; +#else + XXX_KVM_PROBE; +#endif } -inline unsigned long -bad_hva(void) +void +kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn) { - return (PAGEOFFSET); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + mark_page_accessed(pfn_to_page(pfn)); +#else + XXX_KVM_PROBE; +#endif } -unsigned long -gfn_to_hva(struct kvm *kvm, gfn_t gfn) +void +kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn) { - struct kvm_memory_slot *slot; - - gfn = unalias_gfn_instantiation(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return (bad_hva()); - - return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); + if (!kvm_is_mmio_pfn(pfn)) + get_page(pfn_to_page(pfn)); } -int -kvm_is_error_hva(unsigned long addr) +static int +next_segment(unsigned long len, int offset) { - return (addr == bad_hva()); + if (len > PAGESIZE - offset) + return (PAGESIZE - offset); + else + return (len); } int @@ -2182,258 +1025,55 @@ kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len) return (0); } -/* - * Load the pae pdptrs. Return true is they are all valid. 
- */ int -load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { - gfn_t pdpt_gfn = cr3 >> PAGESHIFT; - unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2; - int i; + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); int ret; - uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, - pdpte, offset * sizeof (uint64_t), sizeof (pdpte)); - - if (ret < 0) { - ret = 0; - goto out; - } - - for (i = 0; i < ARRAY_SIZE(pdpte); i++) { - if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { - ret = 0; - goto out; - } - } - ret = 1; - - memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty); -out: - return (ret); -} - -static void -update_cr8_intercept(struct kvm_vcpu *vcpu) -{ - int max_irr, tpr; - - if (!kvm_x86_ops->update_cr8_intercept) - return; - - if (!vcpu->arch.apic) - return; - if (!vcpu->arch.apic->vapic_addr) - max_irr = kvm_lapic_find_highest_irr(vcpu); - else - max_irr = -1; - - if (max_irr != -1) - max_irr >>= 4; - tpr = kvm_lapic_get_cr8(vcpu); - - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); -} - -inline int -is_protmode(struct kvm_vcpu *vcpu) -{ - return (kvm_read_cr0_bits(vcpu, X86_CR0_PE)); -} - -int -kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) -{ - return (vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id); -} - -unsigned long -find_next_bit(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - const unsigned long *p = addr + (offset/64); - unsigned long result = offset & ~(64-1); - unsigned long tmp; - - if (offset >= size) - return (size); - - size -= result; - offset %= 64; - - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < 64) - goto found_first; - if (tmp) - goto found_middle; - size -= 64; - result += 64; - } - while (size & ~(64-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += 64; - size -= 64; - } - - if (!size) - return (result); - tmp = *p; - -found_first: - tmp &= (~0UL >> (64 - size)); - if (tmp == 0UL) /* Are any bits set? */ - return (result + size); /* Nope. 
*/ -found_middle: - return (result + __ffs(tmp)); -} - -int -kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - int mmu_reset_needed = 0; - int pending_vec, max_bits; - struct descriptor_table dt; - - vcpu_load(vcpu); - - dt.limit = sregs->idt.limit; - dt.base = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); - dt.limit = sregs->gdt.limit; - dt.base = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); - - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; - - kvm_set_cr8(vcpu, sregs->cr8); - - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); - kvm_set_apic_base(vcpu, sregs->apic_base); - - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; - - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - - if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); - mmu_reset_needed = 1; - } - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); - - max_bits = (sizeof (sregs->interrupt_bitmap)) << 3; - pending_vec = - find_next_bit((const unsigned long *)sregs->interrupt_bitmap, - max_bits, 0); + uintptr_t dp = (uintptr_t)data; - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, 0); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, (void *)dp, offset, seg); + if (ret < 0) + return (ret); + offset = 0; + len -= seg; + dp += seg; + ++gfn; } - - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - update_cr8_intercept(vcpu); - -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - /* Older userspace won't unhalt the vcpu on reset. */ - if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && - sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -#endif /* CONFIG_KVM_APIC_ARCHITECTURE */ - - vcpu_put(vcpu); - return (0); } -static void -kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +int +kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { - static int version; - struct pvclock_wall_clock wc; - struct timespec boot; - - if (!wall_clock) - return; - - version++; + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGESHIFT; + int offset = offset_in_page(gpa); - kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (-EFAULT); - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. 
- */ #ifdef XXX - getboottime(&boot); - - wc.sec = boot.tv_sec; - wc.nsec = boot.tv_nsec; - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc)); - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + pagefault_disable(); #else XXX_KVM_PROBE; #endif -} - -static int -next_segment(unsigned long len, int offset) -{ - if (len > PAGESIZE - offset) - return (PAGESIZE - offset); - else - return (len); -} -void -mark_page_dirty(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *memslot; - - gfn = unalias_gfn(kvm, gfn); - memslot = gfn_to_memslot_unaliased(kvm, gfn); - - if (memslot && memslot->dirty_bitmap) { - unsigned long rel_gfn = gfn - memslot->base_gfn; - unsigned long *p = memslot->dirty_bitmap + rel_gfn / 64; - int offset = rel_gfn % 64; + r = copyin((caddr_t)addr + offset, data, len); +#ifdef XXX + pagefault_enable(); +#else + XXX_KVM_PROBE; +#endif + if (r) + return (-EFAULT); - /* avoid RMW */ - if (!test_bit(offset, p)) - __set_bit(offset, p); - } + return (0); } int @@ -2484,1010 +1124,277 @@ kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) return (0); } -static int -xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - uint8_t *blob_addr = lm ? - (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 : - (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - uint8_t blob_size = lm ? - kvm->arch.xen_hvm_config.blob_size_64 : - kvm->arch.xen_hvm_config.blob_size_32; - uint32_t page_num = data & ~PAGEMASK; - uint64_t page_addr = data & PAGEMASK; - uint8_t *page; - int r; - - r = E2BIG; - if (page_num >= blob_size) - goto out; - r = ENOMEM; - page = kmem_alloc(PAGESIZE, KM_SLEEP); - r = EFAULT; - if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) - goto out_free; - if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) - goto out_free; - r = 0; -out_free: - kmem_free(page, PAGESIZE); -out: - return (r); -} - -static void -set_efer(struct kvm_vcpu *vcpu, uint64_t efer) -{ - if (efer & efer_reserved_bits) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_paging(vcpu) && - (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - kvm_inject_gp(vcpu, 0); - return; - } - } - - if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - kvm_inject_gp(vcpu, 0); - return; - } - } - - kvm_x86_ops->set_efer(vcpu, efer); - - efer &= ~EFER_LMA; - efer |= vcpu->arch.efer & EFER_LMA; - - vcpu->arch.efer = efer; - - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - kvm_mmu_reset_context(vcpu); -} - -static int -msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return (1); - case 0x2f8: - return (1); - } - - return (0); -} - -static int -valid_pat_type(unsigned t) -{ - return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */ -} - -static int -valid_mtrr_type(unsigned t) -{ - return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */ -} - -static int -mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +int +kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - int i; - - if (!msr_mtrr_valid(msr)) - return (0); - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return (0); - return (1); - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return (0); - return (valid_mtrr_type(data & 0xff)); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return (0); - return (1); - } - - /* variable MTRRs */ - return (valid_mtrr_type(data & 0xff)); + return (kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len)); } -static int -set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +void +mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - struct mtrr_state_type *state = &vcpu->arch.mtrr_state; - - uint64_t *p = (uint64_t *)&state->fixed_ranges; + struct kvm_memory_slot *memslot; - if (!mtrr_valid(vcpu, msr, data)) - return (1); + gfn = unalias_gfn(kvm, gfn); + memslot = gfn_to_memslot_unaliased(kvm, gfn); - if (msr == MSR_MTRRdefType) { - state->def_type = data; - state->enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - uint64_t *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - - if (!is_mtrr_mask) { - pt = (uint64_t *)&state->var_ranges[idx].base_lo; - } else { - pt = (uint64_t *)&state->var_ranges[idx].mask_lo; - } + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + unsigned long *p = memslot->dirty_bitmap + rel_gfn / 64; + int offset = rel_gfn % 64; - *pt = data; + /* avoid RMW */ + if (!test_bit(offset, p)) + __set_bit(offset, p); } - - kvm_mmu_reset_context(vcpu); - - return (0); -} - -int -clear_user(void *addr, unsigned long size) -{ - caddr_t ka; - int rval = 0; - - ka = kmem_zalloc(size, KM_SLEEP); - rval = copyout(ka, addr, size); - kmem_free(ka, size); - - return (rval); } -static int -set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
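+ *
+ * Editor's note: the loop below parks the thread on kvcpu_kick_cv and
+ * re-checks runnability after every wakeup.  The waker side (defined
+ * elsewhere in this change) is expected to signal the CV under the same
+ * lock, roughly:
+ *
+ *	mutex_enter(&vcpu->kvcpu_kick_lock);
+ *	cv_broadcast(&vcpu->kvcpu_kick_cv);
+ *	mutex_exit(&vcpu->kvcpu_kick_lock);
+ *
+ * Illustrative sketch only; see the actual kick routine for the
+ * authoritative sequence.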
+ */
+void
+kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
-	switch (msr) {
-	case HV_X64_MSR_APIC_ASSIST_PAGE: {
-		unsigned long addr;
-
-		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
-			vcpu->arch.hv_vapic = data;
+	for (;;) {
+		if (kvm_arch_vcpu_runnable(vcpu)) {
+			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
			break;
		}
-		addr = gfn_to_hva(vcpu->kvm,
-		    data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
-
-		if (kvm_is_error_hva(addr))
-			return (1);
-
-		if (clear_user((void *)addr, PAGESIZE))
-			return (1);
-
-		vcpu->arch.hv_vapic = data;
-		break;
-	}
-
-	case HV_X64_MSR_EOI:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data));
-	case HV_X64_MSR_ICR:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data));
-	case HV_X64_MSR_TPR:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data));
-
-	default:
-		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
-		    "data 0x%lx\n", msr, data);
-		return (1);
-	}
-
-	return (0);
-}
+		if (issig(JUSTLOOKING))
+			break;

-static int
-set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
-{
-	struct kvm *kvm = vcpu->kvm;
-
-	switch (msr) {
-	case HV_X64_MSR_GUEST_OS_ID:
-		kvm->arch.hv_guest_os_id = data;
-		/* setting guest os id to zero disables hypercall page */
-		if (!kvm->arch.hv_guest_os_id)
-			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
-		break;
-	case HV_X64_MSR_HYPERCALL: {
-		uint64_t gfn;
-		unsigned long addr;
-		uint8_t instructions[4];
+	mutex_enter(&vcpu->kvcpu_kick_lock);

-		/* if guest os id is not set, hypercall should remain disabled */
-		if (!kvm->arch.hv_guest_os_id)
-			break;
-		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
-			kvm->arch.hv_hypercall = data;
+		if (kvm_cpu_has_pending_timer(vcpu)) {
+			mutex_exit(&vcpu->kvcpu_kick_lock);
			break;
		}
-		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
-		addr = gfn_to_hva(kvm, gfn);
-		if (kvm_is_error_hva(addr))
-			return (1);
-		kvm_x86_ops->patch_hypercall(vcpu, instructions);
-		((unsigned char *)instructions)[3] = 0xc3; /* ret */
-		if (copyout(instructions, (caddr_t)addr, 4))
-			return (1);
-		kvm->arch.hv_hypercall = data;
-		break;
-	}
-	default:
-		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
-		    "data 0x%lx\n", msr, data);
-		return (1);
-	}
-
-	return (0);
-}
-static int
-set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
-{
-	uint64_t mcg_cap = vcpu->arch.mcg_cap;
-	unsigned bank_num = mcg_cap & 0xff;
+	(void) cv_wait_sig_swap(&vcpu->kvcpu_kick_cv,
+	    &vcpu->kvcpu_kick_lock);

-	switch (msr) {
-	case MSR_IA32_MCG_STATUS:
-		vcpu->arch.mcg_status = data;
-		break;
-	case MSR_IA32_MCG_CTL:
-		if (!(mcg_cap & MCG_CTL_P))
-			return (1);
-		if (data != 0 && data != ~(uint64_t)0)
-			return (-1);
-		vcpu->arch.mcg_ctl = data;
-		break;
-	default:
-		if (msr >= MSR_IA32_MC0_CTL &&
-		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
-			uint32_t offset = msr - MSR_IA32_MC0_CTL;
-			/*
-			 * only 0 or all 1s can be written to IA32_MCi_CTL;
-			 * some Linux kernels though clear bit 10 in bank 4 to
-			 * work around a BIOS/GART TLB issue on AMD K8s, so
-			 * ignore this to avoid an uncaught #GP in the guest
-			 */
-			if ((offset & 0x3) == 0 &&
-			    data != 0 && (data | (1 << 10)) != ~(uint64_t)0)
-				return (-1);
-			vcpu->arch.mce_banks[offset] = data;
-			break;
-		}
-		return (1);
+		mutex_exit(&vcpu->kvcpu_kick_lock);
	}
-	return (0);
}

-static int
-kvm_hv_msr_partition_wide(uint32_t msr)
+/*
+ * Creates some virtual cpus. Good luck creating more than one. 
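+ *
+ * Editor's note: this is the backend of the KVM_CREATE_VCPU ioctl.  A
+ * minimal (hypothetical) userspace sequence would be:
+ *
+ *	int id = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+ *	if (id == -1)
+ *		err(1, "KVM_CREATE_VCPU");
+ *
+ * Unlike Linux, which hands back a new vcpu file descriptor, this port
+ * returns the new vcpu's id through rval_p.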
+ */ +int +kvm_vm_ioctl_create_vcpu(struct kvm *kvm, uint32_t id, int *rval_p) { - int r = 0; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - r = 1; - break; - } + int r, i; + struct kvm_vcpu *vcpu, *v; - return (r); -} + vcpu = kvm_arch_vcpu_create(kvm, id); + if (vcpu == NULL) + return (EINVAL); -#ifdef XXX_KVM_DECLARATION -#define pfn_valid(pfn) ((pfn < physmax) && (pfn != PFN_INVALID)) +#ifdef XXX + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); #else -#define pfn_valid(pfn) (pfn != PFN_INVALID) + XXX_KVM_PROBE; #endif -inline int -kvm_is_mmio_pfn(pfn_t pfn) -{ - if (pfn_valid(pfn)) { + r = kvm_arch_vcpu_setup(vcpu); + if (r) + return (r); + + mutex_enter(&kvm->lock); + #ifdef XXX - struct page *page = compound_head(pfn_to_page(pfn)); - return (PageReserved(page)); + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; + if (kvm->online_vcpus == KVM_MAX_VCPUS) { #endif - return (0); - } else - return (1); -} + r = EINVAL; + goto vcpu_destroy; + } -page_t * -gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - pfn_t pfn = gfn_to_pfn(kvm, gfn); + /* kvm_for_each_vcpu(r, v, kvm) */ + for (i = 0; i < kvm->online_vcpus; i++) { + v = kvm->vcpus[i]; + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + } - if (!kvm_is_mmio_pfn(pfn)) - return (pfn_to_page(pfn)); + /* BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); */ - get_page(bad_page); - return (bad_page); -} + /* Now it's all set up, let userspace reach it */ + kvm_get_kvm(kvm); -void -kvm_release_page_dirty(page_t *page) -{ - kvm_release_pfn_dirty(page_to_pfn(page)); -} + *rval_p = kvm->online_vcpus; /* guarantee unique id */ + vcpu->vcpu_id = *rval_p; + + /* XXX need to protect online_vcpus */ + kvm->vcpus[kvm->online_vcpus] = vcpu; -int -kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) -{ - switch (msr) { - case MSR_EFER: - set_efer(vcpu, data); - break; - case MSR_K7_HWCR: - data &= ~(uint64_t)0x40; /* ignore flush filter disable */ - if (data != 0) { - cmn_err(CE_NOTE, - "unimplemented HWCR wrmsr: 0x%lx\n", data); - return (1); - } - break; - case MSR_FAM10H_MMIO_CONF_BASE: - if (data != 0) { - cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%lx\n", data); - return (1); - } - break; - case MSR_AMD64_NB_CFG: - break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* - * Values other than LBR and BTF are vendor-specific, - * thus reserved and should throw a #GP - */ - return (1); - } - cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n", - __func__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - break; - case 0x200 ... 0x2ff: - return (set_msr_mtrr(vcpu, msr, data)); - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return (kvm_x2apic_msr_write(vcpu, msr, data)); - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); - break; - case MSR_KVM_SYSTEM_TIME: { #ifdef XXX - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + smp_wmb(); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif + atomic_inc_32(&kvm->online_vcpus); - vcpu->arch.time = data; +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif - /* we verify if the enable bit is set... */ - if (!(data & 1)) - break; + mutex_exit(&kvm->lock); + return (r); - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); +vcpu_destroy: #ifdef XXX - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGESHIFT); - - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } - - kvm_request_guest_time_update(vcpu); + mutex_exit(&kvm->lock); + kvm_arch_vcpu_destroy(vcpu); #else - XXX_KVM_PROBE; + XXX_KVM_PROBE; #endif - break; - } - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return (set_msr_mce(vcpu, msr, data)); - - /* - * Performance counters are not protected by a CPUID bit, so we should - * check all of them in the generic path for the sake of cross vendor - * migration. Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " - "0x%x data 0x%lx\n", msr, data); - break; - /* - * at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " - "0x%x data 0x%lx\n", msr, data); - break; - case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_enter(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_exit(&vcpu->kvm->lock); - return (r); - } else - return (set_msr_hyperv(vcpu, msr, data)); - break; - default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return (xen_hvm_config(vcpu, data)); - if (!ignore_msrs) { - cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %lx\n", - msr, data); - return (1); - } else { - cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %lx\n", - msr, data); - break; - } - } - - return (0); -} - -static int -get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) -{ - struct mtrr_state_type *state = &vcpu->arch.mtrr_state; - uint64_t *p = (uint64_t *)&state->fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return (1); - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - uint64_t *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) { - pt = (uint64_t *)&state->var_ranges[idx].base_lo; - } else { - pt = (uint64_t *)&state->var_ranges[idx].mask_lo; - } - - *pdata = *pt; - } - - return (0); + return (r); } static int -get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) { - uint64_t data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) - data = r; - break; - } - case HV_X64_MSR_EOI: - return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata)); - case HV_X64_MSR_ICR: - return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata)); - case HV_X64_MSR_TPR: - return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata)); - default: - cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return (1); - } + if (sigset) { + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; - *pdata = data; return (0); } static int -get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_dev_ioctl_create_vm(kvm_devstate_t *ksp, intptr_t arg, int *rv) { - uint64_t data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - default: - cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return (1); - } - - *pdata = data; + if (ksp->kds_kvmp != NULL) + return (EINVAL); - return (0); -} + ksp->kds_kvmp = kvm_create_vm(); -static int -get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) -{ - uint64_t data; - uint64_t mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return (1); - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - default: - if (msr >= MSR_IA32_MC0_CTL 
&& - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { - uint32_t offset = msr - MSR_IA32_MC0_CTL; - data = vcpu->arch.mce_banks[offset]; - break; - } - return (1); + if (ksp->kds_kvmp == NULL) { + cmn_err(CE_WARN, "Could not create new vm\n"); + return (EIO); } - *pdata = data; - return (0); + *rv = ksp->kds_kvmp->kvmid; + return (DDI_SUCCESS); } int -kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_dev_ioctl_check_extension_generic(long arg, int *rv) { - uint64_t data; - - switch (msr) { - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: - case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: - case MSR_IA32_LASTBRANCHFROMIP: - case MSR_IA32_LASTBRANCHTOIP: - case MSR_IA32_LASTINTFROMIP: - case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: - case MSR_K7_HWCR: - case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - case MSR_K7_EVNTSEL0: - case MSR_K7_PERFCTR0: - case MSR_K8_INT_PENDING_MSG: - case MSR_AMD64_NB_CFG: - case MSR_FAM10H_MMIO_CONF_BASE: - data = 0; - break; - case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; - break; - case 0x200 ... 0x2ff: - return (get_msr_mtrr(vcpu, msr, pdata)); - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return (kvm_x2apic_msr_read(vcpu, msr, pdata)); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; - case MSR_IA32_PERF_STATUS: - /* TSC increment by tick */ - data = 1000ULL; - /* CPU multiplier */ - data |= (((uint64_t)4ULL) << 40); - break; - case MSR_EFER: - data = vcpu->arch.efer; - break; - case MSR_KVM_WALL_CLOCK: - data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - data = vcpu->arch.time; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return (get_msr_mce(vcpu, msr, pdata)); - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_enter(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); - mutex_exit(&vcpu->kvm->lock); - return (r); - } else - return (get_msr_hyperv(vcpu, msr, pdata)); - break; + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: + *rv = 1; + return (DDI_SUCCESS); +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + *rv = KVM_MAX_IRQ_ROUTES; + return (DDI_SUCCESS); +#endif default: - if (!ignore_msrs) { - cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); - return (1); - } else { - cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); - data = 0; - } break; } - *pdata = data; - - return (0); + return (kvm_dev_ioctl_check_extension(arg, rv)); } -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. 
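- *
- * Editor's note: the loop below stops at the first MSR that do_msr()
- * rejects, so a return of r < msrs->nmsrs identifies the failing entry.
- * A (hypothetical) caller could surface that as:
- *
- *	if ((r = __msr_io(vcpu, msrs, entries, do_set_msr)) < msrs->nmsrs)
- *		cmn_err(CE_NOTE, "wrmsr 0x%x rejected", entries[r].index);
- *
- * Illustrative only; the real callers are expected to simply hand the
- * count back to userspace.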
- */
-static int
-__msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
-    struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu,
-    unsigned index, uint64_t *data))
+static void
+hardware_enable(void *junk)
{
-	int i, idx;
-
-	vcpu_load(vcpu);
-
-#ifdef XXX
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-#else
-	XXX_KVM_SYNC_PROBE;
-#endif
-	for (i = 0; i < msrs->nmsrs; i++) {
-		if (do_msr(vcpu, entries[i].index, &entries[i].data))
-			break;
-	}
-
-#ifdef XXX
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-#else
-	XXX_KVM_SYNC_PROBE;
-#endif
-	vcpu_put(vcpu);
-
-	return (i);
-}
+	int cpu;
+	int r;

-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int
-kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata)
-{
-	return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata));
-}
+	cpu = curthread->t_cpu->cpu_id;

-/*
- * Writes an msr value into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int
-kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
-{
-	return (kvm_x86_ops->set_msr(vcpu, msr_index, data));
-}
+	if (CPU_IN_SET(cpus_hardware_enabled, cpu))
+		return;

-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int
-do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data)
-{
-	return (kvm_set_msr(vcpu, index, *data));
-}
+	CPUSET_ADD(cpus_hardware_enabled, cpu);

-#define EXCPT_BENIGN 0
-#define EXCPT_CONTRIBUTORY 1
-#define EXCPT_PF 2
+	r = kvm_arch_hardware_enable(NULL);

-static int
-exception_class(int vector)
-{
-	switch (vector) {
-	case PF_VECTOR:
-		return (EXCPT_PF);
-	case DE_VECTOR:
-	case TS_VECTOR:
-	case NP_VECTOR:
-	case SS_VECTOR:
-	case GP_VECTOR:
-		return (EXCPT_CONTRIBUTORY);
-	default:
-		break;
+	if (r) {
+		CPUSET_DEL(cpus_hardware_enabled, cpu);
+		atomic_inc_32(&hardware_enable_failed);
+		cmn_err(CE_WARN, "kvm: enabling virtualization CPU%d failed\n",
+		    cpu);
	}
-
-	return (EXCPT_BENIGN);
}

static void
-kvm_multiple_exception(struct kvm_vcpu *vcpu,
-    unsigned nr, int has_error, uint32_t error_code)
+hardware_disable(void *junk)
{
-	uint32_t prev_nr;
-	int class1, class2;
-
-	if (!vcpu->arch.exception.pending) {
-queue:
-		vcpu->arch.exception.pending = 1;
-		vcpu->arch.exception.has_error_code = has_error;
-		vcpu->arch.exception.nr = nr;
-		vcpu->arch.exception.error_code = error_code;
-		return;
-	}
+	int cpu = curthread->t_cpu->cpu_id;

-	/* a second exception is pending: decide how the two combine */
-	prev_nr = vcpu->arch.exception.nr;
-	if (prev_nr == DF_VECTOR) {
-		/* triple fault -> shutdown */
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+	if (!CPU_IN_SET(cpus_hardware_enabled, cpu))
		return;
-	}
-	class1 = exception_class(prev_nr);
-	class2 = exception_class(nr);
-	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
-	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/* generate double fault per SDM Table 5-5 */
-		vcpu->arch.exception.pending = 1;
-		vcpu->arch.exception.has_error_code = 1;
-		vcpu->arch.exception.nr = DF_VECTOR;
-		vcpu->arch.exception.error_code = 0;
-	} else {
-		/*
-		 * replace the previous exception with a new one in the hope
-		 * that instruction re-execution will regenerate the lost
-		 * exception
-		 */
-		goto queue;
-	}
-}

-void
-kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
-	kvm_multiple_exception(vcpu, nr, 0, 0);
-}
-void
-kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code)
-{
-	
kvm_multiple_exception(vcpu, nr, 1, error_code); + CPUSET_DEL(cpus_hardware_enabled, cpu); + kvm_arch_hardware_disable(NULL); } -inline void -kvm_clear_exception_queue(struct kvm_vcpu *vcpu) -{ - vcpu->arch.exception.pending = 0; -} +/* + * The following needs to run on each cpu. Currently, + * wait is always 1, so we use the kvm_xcall() routine which + * calls xc_sync. Later, if needed, the implementation can be + * changed to use xc_call or xc_call_nowait. + */ +#define on_each_cpu(func, info, wait) \ + /*CSTYLED*/ \ + ({ \ + kvm_xcall(KVM_CPUALL, func, info); \ + 0; \ + }) -inline void -kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +static void +hardware_disable_all_nolock(void) { - vcpu->arch.interrupt.pending = 0; + kvm_usage_count--; + if (!kvm_usage_count) + on_each_cpu(hardware_disable, NULL, 1); } - -void -shared_msr_update(unsigned slot, uint32_t msr) +static void +hardware_disable_all(void) { - struct kvm_shared_msrs *smsr; - uint64_t value; - smsr = shared_msrs[CPU->cpu_id]; - - /* - * only read, and nobody should modify it at this time, - * so don't need lock - */ - if (slot >= shared_msrs_global.nr) { - cmn_err(CE_WARN, "kvm: invalid MSR slot!"); - return; - } - - rdmsrl_safe(msr, (unsigned long long *)&value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; + mutex_enter(&kvm_lock); + hardware_disable_all_nolock(); + mutex_exit(&kvm_lock); } -void -kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value, - uint64_t mask) +static int +hardware_enable_all(void) { - struct kvm_shared_msrs *smsr = shared_msrs[CPU->cpu_id]; - - if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; - - smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + int r = 0; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - kvm_user_return_notifier_register(vcpu, &smsr->urn); - smsr->registered = 1; - } -} + mutex_enter(&kvm_lock); -int -kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return (kvm_x86_ops->interrupt_allowed(vcpu)); -} + kvm_usage_count++; + if (kvm_usage_count == 1) { + hardware_enable_failed = 0; + on_each_cpu(hardware_enable, NULL, 1); -static int -kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t access, uint32_t *error) -{ - uintptr_t data = (uintptr_t)val; - int r = 0; /* X86EMUL_CONTINUE */ - - while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - access, error); - unsigned offset = addr & (PAGESIZE-1); - unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) { - r = 1; /* X86EMUL_PROPAGATE_FAULT */ - goto out; - } - ret = kvm_read_guest(vcpu->kvm, gpa, (void *)data, toread); - if (ret < 0) { - r = 1; /* X86EMUL_UNHANDLEABLE */ - goto out; + if (hardware_enable_failed) { + hardware_disable_all_nolock(); + r = EBUSY; } - - bytes -= toread; - data += toread; - addr += toread; } -out: - return (r); -} - -void -kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - uint32_t error_code) -{ - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest); - vcpu->arch.cr2 = addr; - kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); -} -static int -kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error)); -} - -/* used for instruction fetching */ -static int -kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, 
uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; + mutex_exit(&kvm_lock); - return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, error)); + return (r); } /* kvm_io_bus_write - called under kvm->slots_lock */ @@ -3530,2596 +1437,398 @@ kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return (-EOPNOTSUPP); } -static int -vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) -{ - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return (0); - - return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); -} - -static int -vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) -{ - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return (0); - - return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); -} - -gpa_t -kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; - - return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); -} - -static int -kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; - - return (kvm_read_guest_virt_helper(addr, val, - bytes, vcpu, access, error)); -} - -static int -emulator_read_emulated(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - uint32_t error_code; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, - vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val); - - vcpu->mmio_read_completed = 0; - return (X86EMUL_CONTINUE); - } - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); - return (X86EMUL_PROPAGATE_FAULT); - } - - /* For APIC access vmexit */ - if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (kvm_read_guest_virt(addr, val, - bytes, vcpu, NULL) == X86EMUL_CONTINUE) - return (X86EMUL_CONTINUE); - -mmio: - /* - * Is this MMIO handled locally? - */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa, - uint64_t, *(uint64_t *)val); - return (X86EMUL_CONTINUE); - } - - KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes, - uintptr_t, gpa); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; - - return (X86EMUL_UNHANDLEABLE); -} - -int -emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; - - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - - if (ret < 0) - return (0); - - kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); - - return (1); -} - -gpa_t -kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
- PFERR_USER_MASK : 0; - - access |= PFERR_WRITE_MASK; - - return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); -} - -static int -emulator_write_emulated_onepage(unsigned long addr, const void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - uint32_t error_code; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); - return (X86EMUL_PROPAGATE_FAULT); - } - - /* For APIC access vmexit */ - if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_write_phys(vcpu, gpa, val, bytes)) - return (X86EMUL_CONTINUE); - -mmio: - KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa, - uint64_t, *(uint64_t *)val); - - /* - * Is this MMIO handled locally? - */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) - return (X86EMUL_CONTINUE); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); - - return (X86EMUL_CONTINUE); -} - +/* Caller must hold slots_lock. */ int -emulator_write_emulated(unsigned long addr, const void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) +kvm_io_bus_register_dev(struct kvm *kvm, + enum kvm_bus bus_idx, struct kvm_io_device *dev) { - uintptr_t data = (uintptr_t)val; - - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGEMASK) { - int rc, now; - - now = -addr & ~PAGEMASK; - rc = emulator_write_emulated_onepage(addr, - (void *)data, now, vcpu); - - if (rc != X86EMUL_CONTINUE) - return (rc); - - addr += now; - data += now; - bytes -= now; - } - - return (emulator_write_emulated_onepage(addr, val, bytes, vcpu)); -} - -static int -emulator_cmpxchg_emulated(unsigned long addr, const void *old, - const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) -{ - cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - page_t page; - char *kaddr; - uint64_t val; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - - if (gpa == UNMAPPED_GVA || - (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + struct kvm_io_bus *new_bus, *bus; - if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) - goto emul_write; - - val = *(uint64_t *)new; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); - kaddr = kmap_atomic(page, KM_USER0); - - set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); - } -emul_write: -#endif - - return (emulator_write_emulated(addr, new, bytes, vcpu)); -} - -static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt_system, - .fetch = kvm_fetch_guest_virt, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, -}; - -static void -cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - -static unsigned long -get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - return (kvm_x86_ops->get_segment_base(vcpu, seg)); -} - -void -kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - uint8_t opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return (-ENOSPC); + 
new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return (-ENOMEM);
+	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
+	new_bus->devs[new_bus->dev_count++] = dev;
#ifdef XXX
-	if (!printk_ratelimit())
-		return;
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
#else
	XXX_KVM_PROBE;
+	kvm->buses[bus_idx] = new_bus;
#endif
+	if (bus)
+		kmem_free(bus, sizeof (struct kvm_io_bus));

-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-	cmn_err(CE_WARN, "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	    context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+	return (0);
}

+/* Caller must hold slots_lock. */
int
-emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
-    uint16_t error_code, int emulation_type)
+kvm_io_bus_unregister_dev(struct kvm *kvm,
+    enum kvm_bus bus_idx, struct kvm_io_device *dev)
{
-	int r, shadow_mask;
-	struct decode_cache *c;
-	struct kvm_run *run = vcpu->run;
+	int i, r;
+	struct kvm_io_bus *new_bus, *bus;

-	kvm_clear_exception_queue(vcpu);
-	vcpu->arch.mmio_fault_cr2 = cr2;
+	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return (-ENOMEM);

-	/*
-	 * TODO: fix emulate.c to use guest_read/write_register
-	 * instead of direct ->regs accesses; this can save hundreds of
-	 * cycles on Intel for instructions that don't read/change RSP,
-	 * for example.
-	 */
-	cache_all_regs(vcpu);
+	bus = kvm->buses[bus_idx];
+	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));

-	vcpu->mmio_is_write = 0;
-	vcpu->arch.pio.string = 0;
+	r = -ENOENT;
+	for (i = 0; i < new_bus->dev_count; i++) {
+		if (new_bus->devs[i] == dev) {
+			r = 0;
+			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			break;
+		}
+	}

-	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-		int cs_db, cs_l;
-		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
-		vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ?
-		    X86EMUL_MODE_REAL :
-		    (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ?
-		    X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 :
-		    cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof (struct kvm_io_bus)); - /* - * Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall) - */ - c = &vcpu->arch.emulate_ctxt.decode; - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return (EMULATE_FAIL); - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return (EMULATE_FAIL); - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return (EMULATE_FAIL); - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return (EMULATE_FAIL); - break; - default: - return (EMULATE_FAIL); - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return (EMULATE_FAIL); - } - - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation); - - if (r) { - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail); - - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return (EMULATE_DONE); - return (EMULATE_FAIL); + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) { + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + break; } } - if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); - return (EMULATE_DONE); - } - - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - - if (vcpu->arch.pio.string) - return (EMULATE_DO_MMIO); - - if ((r || vcpu->mmio_is_write) && run) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return (EMULATE_DONE); - if (!vcpu->mmio_needed) { - kvm_report_emulation_failure(vcpu, "mmio"); - return (EMULATE_FAIL); - } - - return (EMULATE_DO_MMIO); - } - - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return (EMULATE_DO_MMIO); - } - - return (EMULATE_DONE); -} - -int -kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || - vcpu->arch.nmi_injected); -} - -int -kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits); - - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - return (1); - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return (0); - } -} - -static int -kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - uintptr_t data = (uintptr_t)val; - - while (bytes) { - gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); - unsigned offset = addr & (PAGESIZE-1); - unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) - return (X86EMUL_PROPAGATE_FAULT); - - if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0) - return (X86EMUL_UNHANDLEABLE); - - bytes -= towrite; - data += towrite; - addr += towrite; - } - - return (0); -} - -static int -pio_copy_data(struct kvm_vcpu *vcpu) -{ - void *p = vcpu->arch.pio_data; - gva_t q = vcpu->arch.pio.guest_gva; - unsigned bytes; - int ret; - uint32_t error_code; 
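-	/*
-	 * Editor's note (illustrative): this routine shuttles at most one
-	 * pio_data page between the guest virtual address and the in-kernel
-	 * buffer; size * cur_count is assumed, by the callers that set up
-	 * string I/O, never to exceed PAGESIZE.
-	 */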
- - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - - if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); - else - ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); - - if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(vcpu, q, error_code); - - return (ret); -} - -int -complete_pio(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - long delta; - int r; - unsigned long val; - - if (!io->string) { - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(&val, vcpu->arch.pio_data, io->size); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); - } - } else { - if (io->in) { - r = pio_copy_data(vcpu); - if (r) - goto out; - } - - delta = 1; - if (io->rep) { - delta *= io->cur_count; - /* - * The size of the register should really depend on - * current address size. - */ - val = kvm_register_read(vcpu, VCPU_REGS_RCX); - val -= delta; - kvm_register_write(vcpu, VCPU_REGS_RCX, val); - } - if (io->down) - delta = -delta; - delta *= io->size; - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RDI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RDI, val); - } else { - val = kvm_register_read(vcpu, VCPU_REGS_RSI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RSI, val); - } - } -out: - io->count -= io->cur_count; - io->cur_count = 0; - - return (0); -} - -static int -kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) { - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - } else { - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, pd); - } - - return (r); -} - -int -kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) -{ - unsigned long val; - - DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size, - unsigned long, 1) - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE; - vcpu->run->io.count = vcpu->arch.pio.count = - vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.rep = 0; - - if (!vcpu->arch.pio.in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); - } - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - complete_pio(vcpu); - return (1); - } - - return (0); -} - - -void -kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } -#endif - - cr0 &= ~CR0_RESERVED_BITS; - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; + kmem_free(new_bus, sizeof (struct kvm_io_bus)); + return (r); } - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { -#ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; - - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } - } else +#ifdef XXX + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; + kvm->buses[bus_idx] = new_bus; #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - - } - - kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; - kvm_mmu_reset_context(vcpu); -} - -static int -pdptrs_changed(struct kvm_vcpu *vcpu) -{ - uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return (0); - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) { - return (1); - } - - if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, - pdpte, sizeof (pdpte)) < 0) - return (1); - - return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0); -} - -void -kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - return; - } - - if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } - - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. 
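- *
- * Editor's note: "maps to physical memory" concretely means the
- * gfn_to_memslot() check below.  As a worked (hypothetical) example, a
- * guest with a single 512 MB memslot at gpa 0 would have cr3 = 0x1000
- * accepted (gfn 1 falls in the slot), while cr3 = 0x20001000 (gfn
- * 0x20001, past the slot's last gfn 0x1ffff) takes the kvm_inject_gp()
- * path instead.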
- */ - if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } -} - -void -kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - - if (cr4 & CR4_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } - } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && - ((cr4 ^ old_cr4) & pdptr_bits) && - !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } - - kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; - kvm_mmu_reset_context(vcpu); -} - -void -kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) -{ - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); -} - -/* - * Checks if cpl <= required_cpl; if true, return true. Otherwise queue - * a #GP and return false. - */ -int -kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) -{ - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) - return (1); - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return (0); -} - -void -kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - uint32_t function, index; - struct kvm_cpuid_entry2 *best; - - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = kvm_find_cpuid_entry(vcpu, function, index); - if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } - kvm_x86_ops->skip_emulated_instruction(vcpu); - - KVM_TRACE5(cpuid, uint32_t, function, - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RAX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RBX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RCX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RDX)); -} - -static int -kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE); + kmem_free(bus, sizeof (struct kvm_io_bus)); + return (r); } int -kvm_hv_hypercall(struct kvm_vcpu *vcpu) +kvm_init(void *opaque) { - uint64_t param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - int fast, longmode; - int cs_db, cs_l; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return (0); - } - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; - - if (!longmode) { - param = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RAX) & 0xffffffff); - - ingpa = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RCX) & 0xffffffff); + int r; + int cpu; - outgpa = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param 
= kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif + r = kvm_arch_init(opaque); - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; + if (r != DDI_SUCCESS) + return (r); - KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast, - uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa, - uintptr_t, outgpa); + bad_page = alloc_page(PAGESIZE, KM_SLEEP); + bad_pfn = bad_page->p_pagenum; - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: #ifdef XXX - kvm_vcpu_on_spin(vcpu); -#else - XXX_KVM_PROBE; -#endif - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((uint64_t)rep_done & 0xfff) << 32); - - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + r = -ENOMEM; + goto out_free_0; } - - return (1); -} - -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 -#define KVM_EFAULT EFAULT -#define KVM_E2BIG E2BIG -#define KVM_EPERM EPERM - -#define KVM_HC_VAPIC_POLL_IRQ 1 -#define KVM_HC_MMU_OP 2 - -/* - * hypercalls use architecture specific - */ - -#ifdef _KERNEL -#ifdef CONFIG_KVM_GUEST -void __init kvm_guest_init(void); -#else -#define kvm_guest_init() do { } while (0) -#endif - -static unsigned int -kvm_arch_para_features(void) -{ -#ifdef XXX - return (cpuid_eax(KVM_CPUID_FEATURES)); #else XXX_KVM_PROBE; - return (0); #endif -} - -static inline int -kvm_para_has_feature(unsigned int feature) -{ - if (kvm_arch_para_features() & (1UL << feature)) - return (1); - return (0); -} -#endif /* _KERNEL */ - -int -kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return (kvm_hv_hypercall(vcpu)); - - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - - KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1, - uintptr_t, a2, uintptr_t, a3); - - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } + r = kvm_arch_hardware_setup(); - if (kvm_x86_ops->get_cpl(vcpu) != 0) { - ret = -EPERM; - goto out; - } + if (r != DDI_SUCCESS) + goto out_free_0a; - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_MMU_OP: #ifdef XXX - r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 1); + if (r < 0) + goto out_free_1; + } #else - XXX_KVM_PROBE; - ret = -ENOSYS; + r = 0; + kvm_xcall(KVM_CPUALL, kvm_arch_check_processor_compat, &r); + if (r < 0) + goto out_free_1; + XXX_KVM_PROBE; #endif - break; - default: - ret = -ENOSYS; - break; - } -out: - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls); - - return (r); -} - -static int -is_vm86_segment(struct kvm_vcpu *vcpu, int seg) -{ - return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && - (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); -} -static void -seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t 
selector,
-    struct kvm_segment *kvm_desct)
-{
-	kvm_desct->base = get_desc_base(seg_desc);
-	kvm_desct->limit = get_desc_limit(seg_desc);
-	if (seg_desc->c.b.g) {
-		kvm_desct->limit <<= 12;
-		kvm_desct->limit |= 0xfff;
-	}
-	kvm_desct->selector = selector;
-	kvm_desct->type = seg_desc->c.b.type;
-	kvm_desct->present = seg_desc->c.b.p;
-	kvm_desct->dpl = seg_desc->c.b.dpl;
-	kvm_desct->db = seg_desc->c.b.d;
-	kvm_desct->s = seg_desc->c.b.s;
-	kvm_desct->l = seg_desc->c.b.l;
-	kvm_desct->g = seg_desc->c.b.g;
-	kvm_desct->avl = seg_desc->c.b.avl;
-	if (!selector)
-		kvm_desct->unusable = 1;
-	else
-		kvm_desct->unusable = 0;
-	kvm_desct->padding = 0;
-}
-static int
-kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
-{
-	struct kvm_segment segvar = {
-		.base = selector << 4,
-		.limit = 0xffff,
-		.selector = selector,
-		.type = 3,
-		.present = 1,
-		.dpl = 3,
-		.db = 0,
-		.s = 1,
-		.l = 0,
-		.g = 0,
-		.avl = 0,
-		.unusable = 0,
-	};
-	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-	return (0);
-}
-
-static void
-get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct descriptor_table *dtable)
-{
-	if (selector & 1 << 2) {
-		struct kvm_segment kvm_seg;
-
-		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-		if (kvm_seg.unusable)
-			dtable->limit = 0;
-		else
-			dtable->limit = kvm_seg.limit;
-		dtable->base = kvm_seg.base;
-	} else
-		kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8-byte segment descriptors */
-static int
-load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	uint16_t index = selector >> 3;
-	int ret;
-	uint32_t err;
-	gva_t addr;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-		return (1);
-	}
-
-	addr = dtable.base + index * 8;
-	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof (*seg_desc),
-	    vcpu, &err);
-
-	if (ret == 1)
-		kvm_inject_page_fault(vcpu, addr, err);
-
-	return (ret);
-}
-
-/* allowed just for 8-byte segment descriptors */
-static int
-save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	uint16_t index = selector >> 3;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7)
-		return (1);
-
-	return kvm_write_guest_virt(dtable.base + index * 8, seg_desc,
-	    sizeof (*seg_desc), vcpu, NULL);
-}
-
-int
-kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
-{
-	struct kvm_segment kvm_seg;
-	struct desc_struct seg_desc;
-	uint8_t dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	uint32_t err_code = 0;
-	int null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
-
-	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
-		return (kvm_load_realmode_segment(vcpu, selector, seg));
-
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS ||
-	    seg == VCPU_SREG_TR) && null_selector)
-		goto exception;
-
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
-
-	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
-
-	if (ret)
-		return (ret);
-
-	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
-	if (null_selector) { /* for NULL selector skip all following checks */
-		kvm_seg.unusable = 1;
-		goto load;
-	}
-
-	err_code = selector & 0xfffc;
-	err_vec = 
GP_VECTOR;
-
-	/* can't load a system descriptor into a segment selector */
-	if (seg <= VCPU_SREG_GS && !kvm_seg.s)
-		goto exception;
-
-	if (!kvm_seg.present) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
-
-	rpl = selector & 3;
-	dpl = kvm_seg.dpl;
-	cpl = kvm_x86_ops->get_cpl(vcpu);
-
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment, or segment
-		 * selector's RPL != CPL, or segment's DPL != CPL
-		 */
-		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(kvm_seg.type & 8))
-			goto exception;
-
-		if (kvm_seg.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-		break;
-	case VCPU_SREG_TR:
-		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (kvm_seg.s || kvm_seg.type != 2)
-			goto exception;
-		break;
-	default: /* DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((kvm_seg.type & 0xa) == 0x8 ||
-		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
-
-	if (!kvm_seg.unusable && kvm_seg.s) {
-		/* mark segment as accessed */
-		kvm_seg.type |= 1;
-		seg_desc.c.b.type |= 1;
-		save_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	}
-load:
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-	return (0);
-exception:
-	kvm_queue_exception_e(vcpu, err_vec, err_code);
-	return (1);
-
-}
-
-static void
-save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss)
-{
-	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = kvm_rip_read(vcpu);
-	tss->eflags = kvm_get_rflags(vcpu);
-	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void
-kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg)
-{
-	struct kvm_segment kvm_seg;
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	kvm_seg.selector = sel;
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int
-load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss)
-{
-	kvm_set_cr3(vcpu, tss->cr3);
-
-	kvm_rip_write(vcpu, tss->eip);
-	kvm_set_rflags(vcpu, tss->eflags | 2);
-
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-	
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); - kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, - tss->ldt_selector, VCPU_SREG_LDTR)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) - return (1); - - return (0); -} - -static void -save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) -{ - tss->ip = kvm_rip_read(vcpu); - tss->flag = kvm_get_rflags(vcpu); - tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); - - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static int -load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) -{ - kvm_rip_write(vcpu, tss->ip); - kvm_set_rflags(vcpu, tss->flag | 2); - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - - /* - * Now load segment descriptors. 
If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return (1); - - return (0); -} - -int -kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) -{ - gfn_t gfn = gpa >> PAGESHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - uintptr_t dp = (uintptr_t)data; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_read_guest_page(kvm, gfn, (void *)dp, offset, seg); - if (ret < 0) - return (ret); - offset = 0; - len -= seg; - dp += seg; - ++gfn; - } - return (0); -} - -static gpa_t -get_tss_base_addr_write(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) -{ - uint32_t base_addr = get_desc_base(seg_desc); - - return (kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL)); -} - -static gpa_t -get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) -{ - uint32_t base_addr = get_desc_base(seg_desc); - - return (kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL)); -} - -static int -kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector, - uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) -{ - struct tss_segment_16 tss_segment_16; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - save_state_to_tss16(vcpu, &tss_segment_16); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_16.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, - nseg_desc), &tss_segment_16.prev_task_link, - sizeof (tss_segment_16.prev_task_link))) - goto out; - } - - if (load_state_from_tss16(vcpu, &tss_segment_16)) - goto out; - - ret = 1; -out: - return (ret); -} - -static int -kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector, - uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) -{ - struct tss_segment_32 tss_segment_32; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - save_state_to_tss32(vcpu, &tss_segment_32); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_32.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, - nseg_desc), &tss_segment_32.prev_task_link, - sizeof (tss_segment_32.prev_task_link))) - goto out; - } - - if (load_state_from_tss32(vcpu, &tss_segment_32)) - goto out; - - ret = 1; -out: - return (ret); -} - -int -kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason) -{ - struct kvm_segment tr_seg; - struct desc_struct cseg_desc; - struct desc_struct nseg_desc; - int ret = 0; - uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); - uint16_t old_tss_sel = 
get_segment_selector(vcpu, VCPU_SREG_TR); - uint32_t desc_limit; - - old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); - - /* - * FIXME: Handle errors. Failure to read either TSS or their - * descriptors should generate a pagefault. - */ - if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) - goto out; - - if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) - goto out; - - if (reason != TASK_SWITCH_IRET) { - int cpl; - - cpl = kvm_x86_ops->get_cpl(vcpu); - if ((tss_selector & 3) > nseg_desc.c.b.dpl || - cpl > nseg_desc.c.b.dpl) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return (1); - } - } - - desc_limit = get_desc_limit(&nseg_desc); - - if (!nseg_desc.c.b.p || ((desc_limit < 0x67 && - (nseg_desc.c.b.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); - return (1); - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.c.b.type &= ~(1 << 1); // clear the B flag - save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); - } - - if (reason == TASK_SWITCH_IRET) { - uint32_t eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); - } - - /* - * set back link to prev task only if NT bit is set in eflags - * note that old_tss_sel is not used afetr this point - */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - - if (nseg_desc.c.b.type & 8) { - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - } else { - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - } - - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { - uint32_t eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); - } - - if (reason != TASK_SWITCH_IRET) { - nseg_desc.c.b.type |= (1 << 1); - save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc); - } +#ifdef XXX + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); - seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); - tr_seg.type = 11; - kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); -out: - return (ret); -} + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_3; -void -kvm_guest_exit(void) -{ -#ifdef XXX - account_system_vtime(current); - current->flags &= ~PF_VCPU; + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_4; #else XXX_KVM_PROBE; #endif -} -void -kvm_guest_enter(void) -{ #ifdef XXX - account_system_vtime(current); - current->flags |= PF_VCPU; -#else - XXX_KVM_PROBE; -#endif -} - -/* - * Often times we have pages that correspond to addresses that are in a users - * virtual address space. Rather than trying to constantly map them in and out - * of our address space we instead go through and use the kpm segment to - * facilitate this for us. This always returns an address that is always in the - * kernel's virtual address space. 
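A minimal sketch of how the kpm-backed page_address() is typically consumed, assuming a page_t is already in hand; copy_into_page is a hypothetical helper, not a driver function:

static void
copy_into_page(page_t *pp, const void *src, size_t off, size_t len)
{
	caddr_t va = page_address(pp);	/* kpm mapping, no unmap needed */

	ASSERT(off + len <= PAGESIZE);
	bcopy(src, va + off, len);
}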
- */ -caddr_t -page_address(page_t *page) -{ - return (hat_kpm_mapin_pfn(page->p_pagenum)); -} - -static void -inject_pending_event(struct kvm_vcpu *vcpu) -{ - /* try to reinject previous events if any */ - if (vcpu->arch.exception.pending) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - return; - } - - if (vcpu->arch.nmi_injected) { - kvm_x86_ops->set_nmi(vcpu); - return; - } - - if (vcpu->arch.interrupt.pending) { - kvm_x86_ops->set_irq(vcpu); - return; - } - - /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending) { - if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = 1; - kvm_x86_ops->set_nmi(vcpu); - } - } else if (kvm_cpu_has_interrupt(vcpu)) { - if (kvm_x86_ops->interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - 0); - kvm_x86_ops->set_irq(vcpu); - } - } -} - -void -kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); - KVM_TRACE1(fpu, int, 1); -} - -static inline unsigned long -native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - __asm__("mov %%db0, %0" :"=r" (val)); - break; - case 1: - __asm__("mov %%db1, %0" :"=r" (val)); - break; - case 2: - __asm__("mov %%db2, %0" :"=r" (val)); - break; - case 3: - __asm__("mov %%db3, %0" :"=r" (val)); - break; - case 6: - __asm__("mov %%db6, %0" :"=r" (val)); - break; - case 7: - __asm__("mov %%db7, %0" :"=r" (val)); - break; - default: - cmn_err(CE_WARN, "kvm: invalid debug register retrieval, " - "regno = %d\n", regno); - } - - return (val); -} + kvm_chardev_ops.owner = module; + kvm_vm_fops.owner = module; + kvm_vcpu_fops.owner = module; -static inline void -native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - __asm__("mov %0, %%db0" ::"r" (value)); - break; - case 1: - __asm__("mov %0, %%db1" ::"r" (value)); - break; - case 2: - __asm__("mov %0, %%db2" ::"r" (value)); - break; - case 3: - __asm__("mov %0, %%db3" ::"r" (value)); - break; - case 6: - __asm__("mov %0, %%db6" ::"r" (value)); - break; - case 7: - __asm__("mov %0, %%db7" ::"r" (value)); - break; - default: - cmn_err(CE_WARN, "kvm: invalid debug register set, " - "regno = %d\n", regno); + r = misc_register(&kvm_dev); + if (r) { + cmn_err(CE_WARN, "kvm: misc device register failed\n"); + goto out_free; } -} - -static uint32_t -div_frac(uint32_t dividend, uint32_t divisor) -{ - uint32_t quotient, remainder; /* - * Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" + * XXX - if kernel preemption occurs, we probably need + * to implement these, and add hooks to the preemption code. 
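If those hooks were implemented, sched_in/sched_out would plausibly bracket a thread's time on CPU with the existing arch load/put entry points. A sketch under that assumption (the bodies here are guesses; only the kvm_preempt_ops assignments below are real):

static void
kvm_sched_in_sketch(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_arch_vcpu_load(vcpu, cpu);	/* rebind per-CPU VMCS/MSR state */
}

static void
kvm_sched_out_sketch(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_put(vcpu);	/* flush state before losing the CPU */
}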
+ * For right now, we'll make the totally unreasonable + * assumption that we won't be preempted while in the + * kernel, i.e., no realtime threads are running */ - __asm__("divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor)); - - return (quotient); -} - -static void -kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) -{ - uint64_t nsecs = 1000000000LL; - int32_t shift = 0; - uint64_t tps64; - uint32_t tps32; - - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { - tps64 >>= 1; - shift--; - } - - tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; - shift++; - } - - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); -} - -static void -kvm_write_guest_time(struct kvm_vcpu *v) -{ - struct timespec ts; - unsigned long flags; - struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; - unsigned long this_tsc_khz; - - if ((!vcpu->time_page)) - return; - - this_tsc_khz = cpu_tsc_khz; - if (vcpu->hv_clock_tsc_khz != this_tsc_khz) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; - } - -#ifdef XXX - put_cpu_var(cpu_tsc_khz); -#else - XXX_KVM_PROBE; -#endif + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; -#ifdef XXX - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); -#else - /* - * may need to mask interrupts for local_irq_save, and unmask - * for local_irq_restore. cli()/sti() might be done... - */ - XXX_KVM_PROBE; -#endif - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - gethrestime(&ts); -#ifdef XXX - monotonic_to_bootbased(&ts); - local_irq_restore(flags); + kvm_init_debug(); #else XXX_KVM_PROBE; #endif - /* With all the info we got, fill in the values */ - - vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * - (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset; - - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. 
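The guest side of that even/odd version handshake looks roughly like the loop below: retry while an update is in flight (odd version) or while the version changed across the read. A userland-style sketch; the struct is abbreviated from pvclock_vcpu_time_info:

#include <stdint.h>

typedef struct pvclock_time {
	volatile uint32_t version;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
} pvclock_time_t;

static uint64_t
pvclock_read_system_time(const pvclock_time_t *t)
{
	uint32_t v;
	uint64_t st;

	do {
		v = t->version;
		__asm__ volatile("" ::: "memory");	/* compiler barrier */
		st = t->system_time;
		__asm__ volatile("" ::: "memory");
	} while ((v & 1) != 0 || v != t->version);

	return (st);
}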
- */ - vcpu->hv_clock.version += 2; - - shared_kaddr = page_address(vcpu->time_page); - - memcpy((void *)((uintptr_t)shared_kaddr + vcpu->time_offset), - &vcpu->hv_clock, sizeof (vcpu->hv_clock)); - - mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT); -} - -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - -static int -vcpu_enter_guest(struct kvm_vcpu *vcpu) -{ - int r; - - int req_int_win = !irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) - kvm_mmu_unload(vcpu); - } - - r = kvm_mmu_reload(vcpu); - - if (r) - goto out; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, - &vcpu->requests)) { - __kvm_migrate_timers(vcpu); - } - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, - &vcpu->requests)) { - kvm_write_guest_time(vcpu); - } - - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) - kvm_mmu_sync_roots(vcpu); - - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); - - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { - vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; - r = 0; - goto out; - } - - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - r = 0; - goto out; - } - - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, - &vcpu->requests)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } - } - - kpreempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); - - cli(); + return (0); - clear_bit(KVM_REQ_KICK, &vcpu->requests); +out_free: +out_free_5: #ifdef XXX - smp_mb__after_clear_bit(); + sysdev_unregister(&kvm_sysdev); +out_free_4: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); #else XXX_KVM_PROBE; #endif - - if (vcpu->requests || issig(JUSTLOOKING)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); - sti(); - kpreempt_enable(); - r = 1; - goto out; - } - - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } +out_free_2: +out_free_1: #ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_arch_hardware_unsetup(); #else XXX_KVM_PROBE; #endif - kvm_guest_enter(); - - if (vcpu->arch.switch_db_regs) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.eff_db[0], 0); - set_debugreg(vcpu->arch.eff_db[1], 1); - set_debugreg(vcpu->arch.eff_db[2], 2); - set_debugreg(vcpu->arch.eff_db[3], 3); - } - - KVM_TRACE1(vm__entry, int, vcpu->vcpu_id); - - kvm_x86_ops->run(vcpu); +out_free_0a: #ifdef XXX - /* - * If the guest has used debug registers, at least dr7 - * will be disabled while returning to the host. - * If we don't have active breakpoints in the host, we don't - * care about the messed up debug address registers. But if - * we have some of them active, restore the old state. 
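Using the get_debugreg()/set_debugreg() macros defined above, such a save/restore pair might look like the following; host_db is a hypothetical holding area, and the real host-side path remains behind the XXX:

static unsigned long host_db[4];

static void
host_dbregs_save(void)
{
	get_debugreg(host_db[0], 0);
	get_debugreg(host_db[1], 1);
	get_debugreg(host_db[2], 2);
	get_debugreg(host_db[3], 3);
}

static void
host_dbregs_restore(void)
{
	set_debugreg(host_db[0], 0);
	set_debugreg(host_db[1], 1);
	set_debugreg(host_db[2], 2);
	set_debugreg(host_db[3], 3);
}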
- */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); + free_cpumask_var(cpus_hardware_enabled); #else XXX_KVM_PROBE; #endif - set_bit(KVM_REQ_KICK, &vcpu->requests); - - sti(); - +out_free_0: #ifdef XXX - local_irq_enable(); /* XXX - should be ok with kpreempt_enable below */ - - barrier(); + free_page(bad_page, PAGESIZE); #else XXX_KVM_PROBE; #endif - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_exits); - kvm_guest_exit(); - - kpreempt_enable(); +out: #ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - unsigned long rip = kvm_rip_read(vcpu); - profile_hit(KVM_PROFILING, (void *)rip); - } + kvm_arch_exit(); #else XXX_KVM_PROBE; #endif - kvm_lapic_sync_from_vapic(vcpu); - r = kvm_x86_ops->handle_exit(vcpu); - -out: +out_fail: return (r); } -static void -post_kvm_run_save(struct kvm_vcpu *vcpu) -{ - struct kvm_run *kvm_run = vcpu->run; - - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); -} -/* - * The vCPU has executed a HLT instruction with in-kernel mode enabled. - */ void -kvm_vcpu_block(struct kvm_vcpu *vcpu) +kvm_guest_exit(void) { - for (;;) { - if (kvm_arch_vcpu_runnable(vcpu)) { - set_bit(KVM_REQ_UNHALT, &vcpu->requests); - break; - } - - if (issig(JUSTLOOKING)) - break; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (kvm_cpu_has_pending_timer(vcpu)) { - mutex_exit(&vcpu->kvcpu_kick_lock); - break; - } - - (void) cv_wait_sig_swap(&vcpu->kvcpu_kick_cv, - &vcpu->kvcpu_kick_lock); - - mutex_exit(&vcpu->kvcpu_kick_lock); - } +#ifdef XXX + account_system_vtime(current); + current->flags &= ~PF_VCPU; +#else + XXX_KVM_PROBE; +#endif } void -kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - processorid_t cpu = vcpu->cpu; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (CV_HAS_WAITERS(&vcpu->kvcpu_kick_cv)) - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_wakeup); - - cv_broadcast(&vcpu->kvcpu_kick_cv); - mutex_exit(&vcpu->kvcpu_kick_lock); - - if (cpu != CPU->cpu_id && cpu != -1) { - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) { - /* - * If we haven't already kicked this VCPU, we'll poke - * the the CPU on which it's running. (This will serve - * to induce a VM exit.) 
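The KVM_REQ_KICK bit acts as an "already poked" latch: vcpu_enter_guest() clears it just before VM entry, and a kick only sends a cross-CPU poke if the bit was still clear, so at most one interrupt is sent per guest entry. The same protocol in miniature, with a C11 flag standing in for the request bitmap:

#include <stdatomic.h>

typedef struct vcpu_sketch {
	atomic_flag kicked;	/* set = poke already sent or not needed */
} vcpu_sketch_t;

static void
kick(vcpu_sketch_t *v, void (*poke)(void))
{
	if (!atomic_flag_test_and_set(&v->kicked))
		poke();		/* induce a VM exit on the remote CPU */
}

static void
guest_entry(vcpu_sketch_t *v)
{
	atomic_flag_clear(&v->kicked);	/* re-arm for the next kick */
}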
- */ - poke_cpu(cpu); - } - } -} - -static void -vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - page_t *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void -vapic_exit(struct kvm_vcpu *vcpu) +kvm_guest_enter(void) { - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; -#ifdef XXX - idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); #ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, idx); + account_system_vtime(current); + current->flags |= PF_VCPU; #else - XXX_KVM_SYNC_PROBE; + XXX_KVM_PROBE; #endif } static int -dm_request_for_irq_injection(struct kvm_vcpu *vcpu) -{ - return (!irqchip_in_kernel(vcpu->kvm) && - !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); -} - -static int -__vcpu_run(struct kvm_vcpu *vcpu) +kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - int r; - struct kvm *kvm = vcpu->kvm; + minor_t instance; - if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r) - return (r); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (kpm_enable == 0) { + cmn_err(CE_WARN, "kvm: kpm_enable must be true\n"); + return (DDI_FAILURE); } -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - vapic_enter(vcpu); - - r = 1; - while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - r = vcpu_enter_guest(vcpu); - else { -#ifdef XXX - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - kvm_vcpu_block(vcpu); -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (test_and_clear_bit(KVM_REQ_UNHALT, - &vcpu->requests)) { - switch (vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - break; - case KVM_MP_STATE_SIPI_RECEIVED: - default: - r = -EINTR; - break; - } - } - } - if (r <= 0) - break; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - if (kvm_cpu_has_pending_timer(vcpu)) - kvm_inject_pending_timer_irqs(vcpu); + if (kvm_dip != NULL) + return (DDI_FAILURE); - if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_irq_exits); - } + if (ddi_soft_state_init(&kvm_state, sizeof (kvm_devstate_t), 1) != 0) + return (DDI_FAILURE); - if (issig(JUSTLOOKING)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_signal_exits); - } + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "kvm", + S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_fini(&kvm_state); + return (DDI_FAILURE); } -#ifdef XXX - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - post_kvm_run_save(vcpu); - vapic_exit(vcpu); - - return (r); -} - -int -kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) -{ - int r; - sigset_t sigsaved; - struct kvm_run *kvm_run = vcpu->run; - vcpu_load(vcpu); - - if (vcpu->sigset_active) - 
kvm_sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { - kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); - r = -EAGAIN; - goto out; + mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); + if (vmx_init() != DDI_SUCCESS) { + ddi_soft_state_fini(&kvm_state); + ddi_remove_minor_node(dip, NULL); + mutex_destroy(&kvm_lock); + return (DDI_FAILURE); } - /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_set_cr8(vcpu, kvm_run->cr8); - - if (vcpu->arch.pio.cur_count) { -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - r = complete_pio(vcpu); -#ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (r) - goto out; - } - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); -#ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ - r = 0; - goto out; - } + if (hardware_enable_all() != 0) { + ddi_soft_state_fini(&kvm_state); + ddi_remove_minor_node(dip, NULL); + mutex_destroy(&kvm_lock); + vmx_fini(); + return (DDI_FAILURE); } - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) - kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); - - r = __vcpu_run(vcpu); - -out: - if (vcpu->sigset_active) - kvm_sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - vcpu_put(vcpu); - return (r); -} - -int -kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_load(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); - return (0); -} - -int -kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_load(vcpu); - vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); - return (0); -} - -static int -kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_load(vcpu); - - events->exception.injected = vcpu->arch.exception.pending; - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - - events->interrupt.injected = vcpu->arch.interrupt.pending; - events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = vcpu->arch.interrupt.soft; - - events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); - - events->sipi_vector = vcpu->arch.sipi_vector; - - events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | - KVM_VCPUEVENT_VALID_SIPI_VECTOR); - - vcpu_put(vcpu); - - return (0); -} - -static int -kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | - KVM_VCPUEVENT_VALID_SIPI_VECTOR)) - return (-EINVAL); - - vcpu_load(vcpu); - - vcpu->arch.exception.pending = events->exception.injected; - vcpu->arch.exception.nr = events->exception.nr; - vcpu->arch.exception.has_error_code = events->exception.has_error_code; - vcpu->arch.exception.error_code = 
events->exception.error_code; - - vcpu->arch.interrupt.pending = events->interrupt.injected; - vcpu->arch.interrupt.nr = events->interrupt.nr; - vcpu->arch.interrupt.soft = events->interrupt.soft; - - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); - - vcpu->arch.nmi_injected = events->nmi.injected; - - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; - - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_dip = dip; + kvm_base_minor = instance; - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) - vcpu->arch.sipi_vector = events->sipi_vector; + list_create(&vm_list, sizeof (struct kvm), + offsetof(struct kvm, vm_list)); + kvm_minor = vmem_create("kvm_minor", (void *)1, UINT32_MAX - 1, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - vcpu_put(vcpu); + ddi_report_dev(dip); - return (0); + return (DDI_SUCCESS); } static int -kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr) -{ - kvm->arch.ept_identity_map_addr = ident_addr; - return (0); -} - -void -kvm_timer_fire(void *arg) +kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - struct kvm_timer *timer = (struct kvm_timer *)arg; - struct kvm_vcpu *vcpu = timer->vcpu; - - if (vcpu == NULL) - return; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (timer->reinject || !timer->pending) { - atomic_add_32(&timer->pending, 1); - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - } - - timer->intervals++; - - cv_broadcast(&vcpu->kvcpu_kick_cv); - mutex_exit(&vcpu->kvcpu_kick_lock); -} + int instance; -static int -kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s)); - vcpu_put(vcpu); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); - return (0); -} + VERIFY(kvm_dip != NULL && kvm_dip == dip); + instance = ddi_get_instance(dip); + VERIFY(instance == kvm_base_minor); + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + list_destroy(&vm_list); + vmem_destroy(kvm_minor); + kvm_dip = NULL; -static int -kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s)); - kvm_apic_post_state_restore(vcpu); - update_cr8_intercept(vcpu); - vcpu_put(vcpu); + hardware_disable_all(); + mutex_destroy(&kvm_lock); + ddi_soft_state_fini(&kvm_state); + vmx_fini(); - return (0); + return (DDI_SUCCESS); } +/*ARGSUSED*/ static int -kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { - int r; + kvm_devstate_t *rsp; + int error = DDI_FAILURE; - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], - sizeof (struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], - sizeof (struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = EINVAL; + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = kvm_dip; break; - } - - return (r); -} - -static int -kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - mutex_enter(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, - sizeof (struct kvm_pic_state)); - 
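Each PIC branch here has the same lock/copy/unlock shape; as a hypothetical helper (the driver deliberately keeps these inline per chip_id, and the parameter type below is assumed):

static void
pic_state_copyin(struct kvm_pic *pic, int which,
    const struct kvm_pic_state *src)
{
	mutex_enter(&pic->lock);
	memcpy(&pic->pics[which], src, sizeof (struct kvm_pic_state));
	mutex_exit(&pic->lock);
}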
mutex_exit(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - mutex_enter(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, - sizeof (struct kvm_pic_state)); - mutex_exit(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)((uint64_t)getminor((dev_t)arg)); + error = DDI_SUCCESS; break; + default: - r = EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); - - return (r); -} - -static int -kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) -{ - if (irq->irq < 0 || irq->irq >= 256) - return (-EINVAL); - - if (irqchip_in_kernel(vcpu->kvm)) - return (-ENXIO); - - vcpu_load(vcpu); - - kvm_queue_interrupt(vcpu, irq->irq, 0); - - vcpu_put(vcpu); - - return (0); -} - -static int -kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp) -{ - int rval; - uint64_t mcg_cap = *mcg_capp; - unsigned bank_num = mcg_cap & 0xff, bank; - - rval = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) - goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) - goto out; - rval = 0; - vcpu->arch.mcg_cap = mcg_cap; - /* Init IA32_MCG_CTL to all 1s */ - if (mcg_cap & MCG_CTL_P) - vcpu->arch.mcg_ctl = ~(uint64_t)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) - vcpu->arch.mce_banks[bank*4] = ~(uint64_t)0; -out: - return (rval); -} - -static int -kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) -{ - if (sigset) { - vcpu->sigset_active = 1; - vcpu->sigset = *sigset; - } else - vcpu->sigset_active = 0; - - return (0); -} - -static int -kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) -{ - int r; - - if (msrs->nmsrs >= MAX_IO_MSRS) - return (-E2BIG); - - if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0) - return (r); - - *rv = r; - - return (0); + return (error); } +/*ARGSUSED*/ static int -kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) -{ - int r; - - if (msrs->nmsrs >= MAX_IO_MSRS) - return (-E2BIG); - - if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0) - return (-EINVAL); - - *rv = r; - - return (0); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int -kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +kvm_open(dev_t *devp, int flag, int otype, cred_t *credp) { - int r, i; - struct kvm_memory_slot *memslot; - unsigned long n; - unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; - - mutex_enter(&kvm->slots_lock); - - r = EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) - goto out; - - memslot = &kvm->memslots->memslots[log->slot]; - r = ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); + minor_t minor; + kvm_devstate_t *ksp; - dirty_bitmap = kmem_alloc(n, KM_SLEEP); - memset(dirty_bitmap, 0, n); + if (flag & FEXCL || flag & FNDELAY) + return (EINVAL); - for (i = 0; !is_dirty && i < n / sizeof (long); i++) - is_dirty = memslot->dirty_bitmap[i]; + if (otype != OTYP_CHR) + return (EINVAL); - /* If nothing is dirty, don't bother messing with page tables. 
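The is_dirty scan reduces to a word-wise any-bit test that short-circuits on the first dirty word; in standalone form, with nwords computed from kvm_dirty_bitmap_bytes():

#include <stddef.h>

static int
bitmap_any_set(const unsigned long *bm, size_t nwords)
{
	size_t i;

	for (i = 0; i < nwords; i++) {
		if (bm[i] != 0)
			return (1);
	}
	return (0);
}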
*/ - if (is_dirty) { - struct kvm_memslots *slots, *old_slots; + /* + * XXX This should be its own privilage + */ + if (drv_priv(credp) != 0) + return (EPERM); - mutex_enter(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - mutex_exit(&kvm->mmu_lock); + if (!(flag & FREAD && flag & FWRITE)) + return (EINVAL); - slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP); - if (!slots) - goto out_free; + if (getminor(*devp) != kvm_base_minor) + return (ENXIO); - memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots)); - slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + minor = (minor_t)(uintptr_t)vmem_alloc(kvm_minor, + 1, VM_BESTFIT | VM_SLEEP); - old_slots = kvm->memslots; -#ifdef XXX - rcu_assign_pointer(kvm->memslots, slots); - kvm_synchronize_srcu_expedited(&kvm->srcu); -#else - kvm->memslots = slots; - XXX_KVM_SYNC_PROBE; -#endif - dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; - kmem_free(old_slots, sizeof (struct kvm_memslots)); + if (ddi_soft_state_zalloc(kvm_state, minor) != 0) { + vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); + return (ENXIO); } - r = 0; - if (copyout(dirty_bitmap, log->v.dirty_bitmap, n) != 0) - r = EFAULT; -out_free: - kmem_free(dirty_bitmap, n); -out: - mutex_exit(&kvm->slots_lock); - return (r); -} - -static int -kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - struct kvm_pit *vpit = kvm->arch.vpit; - - mutex_enter(&vpit->pit_state.lock); - memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels)); - ps->flags = vpit->pit_state.flags; - mutex_exit(&vpit->pit_state.lock); + *devp = makedevice(getmajor(*devp), minor); + ksp = ddi_get_soft_state(kvm_state, minor); + VERIFY(ksp != NULL); return (0); } +/*ARGSUSED*/ static int -kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +kvm_close(dev_t dev, int flag, int otyp, cred_t *cred) { - boolean_t prev_legacy, cur_legacy, start = B_FALSE; - struct kvm_pit *vpit = kvm->arch.vpit; - - mutex_enter(&vpit->pit_state.lock); - prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + kvm_devstate_t *ksp; + minor_t minor = getminor(dev); + kvm_t *kvmp; - if (!prev_legacy && cur_legacy) - start = B_TRUE; + VERIFY(getminor(dev) != kvm_base_minor); + ksp = ddi_get_soft_state(kvm_state, minor); - memcpy(&vpit->pit_state.channels, &ps->channels, - sizeof (vpit->pit_state.channels)); + if ((kvmp = ksp->kds_kvmp) != NULL) { + mutex_enter(&kvm_lock); - vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start); + if (kvmp->kvm_clones > 0) { + kvmp->kvm_clones--; + mutex_exit(&kvm_lock); + } else { + mutex_exit(&kvm_lock); + kvm_destroy_vm(kvmp); + } + } - mutex_exit(&vpit->pit_state.lock); + ddi_soft_state_free(kvm_state, minor); + vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); return (0); } @@ -6440,6 +2149,11 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) break; } + case KVM_GET_MSR_INDEX_LIST: { + rval = kvm_vm_ioctl_get_msr_index_list(NULL, arg); + *rv = 0; + break; + } case KVM_CREATE_VCPU: { uint32_t id = (uintptr_t)arg; struct kvm *kvmp; @@ -6507,58 +2221,6 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) kmem_free(cpuid, sizeof (struct kvm_cpuid2)); break; } - - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; - struct kvm_msr_list *msr_list; - size_t sz = sizeof (struct kvm_msr_list); - unsigned n; - - 
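From userland, the handler below implies a probe-then-fetch pattern: the driver rewrites nmsrs to the real count before checking the caller's capacity, so a first call with nmsrs == 0 fails with E2BIG but reports the needed size. A sketch with an abbreviated struct layout (field names assumed, error handling trimmed):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

struct msr_list_hdr {
	uint32_t nmsrs;
	uint32_t indices[];	/* flexible array in the real ABI */
};

static struct msr_list_hdr *
get_msr_index_list(int fd, unsigned long req)
{
	struct msr_list_hdr probe, *list;

	memset(&probe, 0, sizeof (probe));
	(void) ioctl(fd, req, &probe);		/* fails E2BIG, sets nmsrs */

	list = calloc(1, sizeof (*list) + probe.nmsrs * sizeof (uint32_t));
	if (list == NULL)
		return (NULL);
	list->nmsrs = probe.nmsrs;
	if (ioctl(fd, req, list) != 0) {
		free(list);
		return (NULL);
	}
	return (list);
}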
msr_list = kmem_zalloc(sz, KM_SLEEP); - - if (copyin(user_msr_list, msr_list, sz) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - n = msr_list->nmsrs; - msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - - if (copyout(msr_list, user_msr_list, sz) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - if (n < msr_list->nmsrs) { - kmem_free(msr_list, sz); - rval = E2BIG; - break; - } - - if (copyout(&msrs_to_save, user_msr_list->indices, - num_msrs_to_save * sizeof (uint32_t))) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - if (copyout(&emulated_msrs, user_msr_list->indices + - num_msrs_to_save, ARRAY_SIZE(emulated_msrs) * - sizeof (uint32_t)) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - kmem_free(msr_list, sz); - - rval = 0; - *rv = 0; - break; - } - case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask *sigmask = argp; struct kvm_signal_mask kvm_sigmask; @@ -7041,43 +2703,64 @@ kvm_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp, off_t len, } -static void -kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn) +static struct cb_ops kvm_cb_ops = { + kvm_open, + kvm_close, /* close */ + nodev, + nodev, + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + kvm_ioctl, + kvm_devmap, + nodev, /* mmap */ + kvm_segmap, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_DEVMAP +}; + +static struct dev_ops kvm_ops = { + DEVO_REV, + 0, + kvm_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + kvm_attach, + kvm_detach, + nodev, /* reset */ + &kvm_cb_ops, + (struct bus_ops *)0 +}; + +static struct modldrv modldrv = { + &mod_driverops, + "kvm driver v0.1", + &kvm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + { &modldrv, NULL } +}; + +int +_init(void) { - unsigned slot; - struct kvm_shared_msrs *locals = - (struct kvm_shared_msrs *)(((caddr_t)urn) - - offsetof(struct kvm_shared_msrs, urn)); - struct kvm_shared_msr_values *values; - - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } - locals->registered = 0; - kvm_user_return_notifier_unregister(vcpu, urn); + + return (mod_install(&modlinkage)); } -void -kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +int +_fini(void) { - kvm_arch_vcpu_uninit(vcpu); - ddi_umem_free(vcpu->cookie); + return (mod_remove(&modlinkage)); } -static int -kvm_avlmmucmp(const void *arg1, const void *arg2) +int +_info(struct modinfo *modinfop) { - const kvm_mmu_page_t *mp1 = arg1; - const kvm_mmu_page_t *mp2 = arg2; - if (mp1->kmp_avlspt > mp2->kmp_avlspt) - return (1); - if (mp1->kmp_avlspt < mp2->kmp_avlspt) - return (-1); - ASSERT(mp1->kmp_avlspt == mp2->kmp_avlspt); - return (0); + return (mod_info(&modlinkage, modinfop)); } /* END CSTYLED */ @@ -43,6 +43,35 @@ struct kvm; struct kvm_vcpu; +typedef struct kvm_user_return_notifier { + void (*on_user_return)(struct kvm_vcpu *, + struct kvm_user_return_notifier *); +} kvm_user_return_notifier_t; + +void kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn); +void kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn); +void kvm_fire_urn(struct kvm_vcpu *vcpu); + +#define KVM_NR_SHARED_MSRS 16 + +typedef struct kvm_shared_msrs_global { + int nr; + uint32_t msrs[KVM_NR_SHARED_MSRS]; +} 
kvm_shared_msrs_global_t; + +typedef struct kvm_shared_msrs { + struct kvm_user_return_notifier urn; + int registered; + struct kvm_shared_msr_values { + uint64_t host; + uint64_t curr; + } values[KVM_NR_SHARED_MSRS]; +} kvm_shared_msrs_t; + +extern struct kvm_shared_msrs **shared_msrs; + /* * It would be nice to use something smarter than a linear search, TBD... * Thankfully we dont expect many devices to register (famous last words :), @@ -316,12 +345,14 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); +int kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr); +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); @@ -340,6 +371,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); +int kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv); +int kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv); +int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp); +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); +int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr); +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *, + struct kvm_cpuid_entry2 *); + int kvm_arch_init(void *opaque); void kvm_arch_exit(void); @@ -349,6 +398,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu * kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -444,20 +494,6 @@ void kvm_guest_enter(void); void kvm_guest_exit(void); void kvm_migrate_timers(struct kvm_vcpu *vcpu); -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -typedef struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; - struct dentry *dentry; -} kvm_stats_debugfs_item_t; -extern struct kvm_stats_debugfs_item debugfs_entries[]; -extern struct dentry *kvm_debugfs_dir; - #ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION #define unalias_gfn_instantiation unalias_gfn #endif @@ -488,27 +524,8 @@ 
void kvm_sigprocmask(int how, sigset_t *, sigset_t *); */ #define offset_in_page(p) ((unsigned long)(p) & ~PAGEMASK) -/* borrowed liberally from linux... */ - -#define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - -#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ -#define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P #define page_to_pfn(page) (page->p_pagenum) - /* LDT or TSS descriptor in the GDT. 16 bytes. */ struct ldttss_desc64 { unsigned short limit0; @@ -701,6 +701,13 @@ kvm_page_table_hashfn(gfn_t gfn) return (gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1)); } +static void +bitmap_zero(unsigned long *dst, int nbits) +{ + int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long); + memset(dst, 0, len); +} + static struct kvm_mmu_page * kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, uint64_t *parent_pte) { @@ -3000,3 +3007,34 @@ is_present_gpte(unsigned long pte) { return (pte & PT_PRESENT_MASK); } + +static struct kvm_mmu_page * +page_private(kvm_t *kvmp, page_t *page) +{ + kvm_mmu_page_t mp, *res; + mp.kmp_avlspt = (uintptr_t)page; + mutex_enter(&kvmp->kvm_avllock); + res = avl_find(&kvmp->kvm_avlmp, &mp, NULL); + mutex_exit(&kvmp->kvm_avllock); + ASSERT(res != NULL); + return (res); +} + +struct kvm_mmu_page * +page_header(kvm_t *kvmp, hpa_t shadow_page) +{ + return (page_private(kvmp, pfn_to_page(shadow_page >> PAGESHIFT))); +} + +int +kvm_avlmmucmp(const void *arg1, const void *arg2) +{ + const kvm_mmu_page_t *mp1 = arg1; + const kvm_mmu_page_t *mp2 = arg2; + if (mp1->kmp_avlspt > mp2->kmp_avlspt) + return (1); + if (mp1->kmp_avlspt < mp2->kmp_avlspt) + return (-1); + ASSERT(mp1->kmp_avlspt == mp2->kmp_avlspt); + return (0); +} @@ -58,5 +58,6 @@ extern int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *, extern void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); extern int kvm_mmu_reload(struct kvm_vcpu *vcpu); extern int is_present_gpte(unsigned long); +extern int kvm_avlmmucmp(const void *arg1, const void *arg2); #endif @@ -195,21 +195,7 @@ kvm_xcall(processorid_t cpu, kvm_xcall_t func, void *arg) kpreempt_enable(); } -uint32_t -bit(int bitno) -{ - return (1 << (bitno & 31)); -} -int -is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return (vcpu->arch.efer & EFER_LMA); -#else - return (0); -#endif -} unsigned short kvm_read_fs(void) @@ -357,3 +343,154 @@ get_page(page_t *page) { page = compound_head(page); } + + +page_t * +pfn_to_page(pfn_t pfn) +{ + return (page_numtopp_nolock(pfn)); +} + + +inline void +kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = 0; +} + +inline void +kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, int soft) +{ + vcpu->arch.interrupt.pending = 1; + vcpu->arch.interrupt.soft = soft; + vcpu->arch.interrupt.nr = vector; +} + +inline void +kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = 0; +} + +int +kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected); +} + +inline int 
+kvm_exception_is_soft(unsigned int nr)
+{
+	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+inline int
+is_protmode(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr0_bits(vcpu, X86_CR0_PE));
+}
+
+int
+is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+	return (vcpu->arch.efer & EFER_LMA);
+#else
+	return (0);
+#endif
+}
+
+inline int
+is_pae(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr4_bits(vcpu, X86_CR4_PAE));
+}
+
+int
+is_pse(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr4_bits(vcpu, X86_CR4_PSE));
+}
+
+int
+is_paging(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+}
+
+uint64_t
+native_read_msr_safe(unsigned int msr, int *err)
+{
+	DECLARE_ARGS(val, low, high);
+	uint64_t ret = 0;
+	on_trap_data_t otd;
+
+	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+		ret = native_read_msr(msr);
+		*err = 0;
+	} else {
+		*err = EINVAL; /* XXX probably not right... */
+	}
+	no_trap();
+
+	return (ret);
+}
+
+/* Can be uninlined because referenced by paravirt */
+int
+native_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+	int err = 0;
+	on_trap_data_t otd;
+
+	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+		native_write_msr(msr, low, high);
+	} else {
+		err = EINVAL; /* XXX probably not right... */
+	}
+	no_trap();
+
+	return (err);
+}
+
+
+/* XXX Where should this live? */
+page_t *
+alloc_page(size_t size, int flag)
+{
+	caddr_t page_addr;
+	pfn_t pfn;
+	page_t *pp;
+
+	if ((page_addr = kmem_zalloc(size, flag)) == NULL)
+		return ((page_t *)NULL);
+
+	pp = page_numtopp_nolock(hat_getpfnum(kas.a_hat, page_addr));
+	return (pp);
+}
+
+int
+kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id);
+}
+
+/*
+ * Oftentimes we have pages that correspond to addresses in a user's virtual
+ * address space. Rather than trying to constantly map them in and out of
+ * our address space, we instead go through the kpm segment, which
+ * facilitates this for us. This always returns an address in the kernel's
+ * virtual address space.
+ */
+caddr_t
+page_address(page_t *page)
+{
+	return (hat_kpm_mapin_pfn(page->p_pagenum));
+}
+
+uint32_t
+bit(int bitno)
+{
+	return (1 << (bitno & 31));
+}
@@ -35,6 +35,12 @@
 #include "kvm_mmu.h"
 #include "kvm_vmx.h"
 
+
+/*
+ * Globals
+ */
+struct kvm_shared_msrs **shared_msrs;
+
 #define	VMX_NR_VPIDS	(1 << 16)
 static kmutex_t vmx_vpid_lock;
 static ulong_t *vmx_vpid_bitmap;
@@ -57,8 +63,6 @@ static kmem_cache_t *kvm_vcpu_cache;
 static struct vmcs **vmxarea;	/* 1 per cpu */
 static struct vmcs **current_vmcs;
 
-/* XXX Should shared_msrs be static? */
-struct kvm_shared_msrs **shared_msrs;
 static list_t **vcpus_on_cpu;
 static uint64_t *vmxarea_pa;	/* physical address of each vmxarea */
 
@@ -587,6 +591,30 @@ vmwrite_error(unsigned long field, unsigned long value)
 	    field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 }
 
+static void
+__vmwrite(unsigned long field, unsigned long value)
+{
+	uint8_t err = 0;
+
+	/*CSTYLED*/
+	__asm__ volatile ( ASM_VMX_VMWRITE_RAX_RDX "\n\t" "setna %0"
+	    /* XXX: CF==1 or ZF==1 --> crash (ud2) */
+	    /* "ja 1f ; ud2 ; 1:\n" */
+	    : "=q"(err) : "a" (value), "d" (field)
+	    : "cc", "memory");
+
+	/* XXX the following should be ifdef debug... */
+	if (err) {
+#ifdef XXX
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+		cmn_err(CE_WARN, "_vmwrite: error writing %lx to %lx: "
+		    "error number = %d\n", value, field, err & 0xff);
+#else
+		XXX_KVM_PROBE;
+#endif
+	}
+}
+
+/* XXX Should be static!
*/ void vmcs_writel(unsigned long field, unsigned long value) @@ -1,38 +1,179 @@ +/* + * + */ + #include <sys/types.h> #include <sys/param.h> -#include <sys/errno.h> -#include <sys/modctl.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> +#include <sys/mutex.h> +#include <sys/ksynch.h> +#include <sys/condvar_impl.h> #include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cpuvar.h> -#include <vm/hat_i86.h> -#include <sys/segments.h> -#include <sys/mman.h> -#include <sys/mach_mmu.h> -#include <sys/int_limits.h> -#include <sys/x_call.h> + +#include <vm/page.h> +#include <vm/hat.h> + +#include <asm/cpu.h> #include "kvm_bitops.h" -#include "kvm_apicdef.h" -#include "kvm_types.h" +#include "kvm_vmx.h" +#include "msr-index.h" +#include "msr.h" +#include "irqflags.h" #include "kvm_host.h" - -#include "kvm_coalesced_mmio.h" +#include "kvm_lapic.h" +#include "processor-flags.h" +#include "kvm_cpuid.h" +#include "hyperv.h" +#include "kvm_apicdef.h" +#include "kvm_iodev.h" +#include "kvm.h" +#include "kvm_x86impl.h" #include "kvm_irq.h" +#include "kvm_tss.h" +#include "kvm_ioapic.h" +#include "kvm_coalesced_mmio.h" #include "kvm_i8254.h" -#include "kvm_x86impl.h" +#include "kvm_mmu.h" +#include "kvm_cache_regs.h" -#undef DEBUG - -extern struct kvm_shared_msrs_global shared_msrs_global; -extern void shared_msr_update(unsigned slot, uint32_t msr); +/* XXX These don't belong here! */ extern caddr_t smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); +#ifdef XXX_KVM_DECLARATION +unsigned long *vmx_io_bitmap_a; +unsigned long *vmx_io_bitmap_b; +unsigned long *vmx_msr_bitmap_legacy; +unsigned long *vmx_msr_bitmap_longmode; +#else +/* make these arrays to try to force into low 4GB memory... */ +/* also need to be aligned... 
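The alignment matters because the VMCS consumes physical addresses of whole 4 KiB bitmaps: a page-aligned static array occupies exactly one page frame, so a single hat_getpfnum() lookup yields the physical address of the entire bitmap. A hedged sketch (bitmap_pa is not a driver function):

static uint64_t
bitmap_pa(void *bitmap)
{
	pfn_t pfn = hat_getpfnum(kas.a_hat, (caddr_t)bitmap);

	return (((uint64_t)pfn << PAGESHIFT) |
	    ((uintptr_t)bitmap & PAGEOFFSET));
}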
*/
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)];
+#endif
+
+static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)];
+
+#define	MAX_IO_MSRS 256
+#define	CR0_RESERVED_BITS	\
+	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+	    | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+	    | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define	CR4_RESERVED_BITS	\
+	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+	    | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+	    | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+	    | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define	CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+/*
+ * EFER defaults:
+ * - enable syscall per default because it's emulated by KVM
+ * - enable LME and LMA per default on 64-bit KVM
+ */
+#ifdef CONFIG_X86_64
+static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
+
+static void update_cr8_intercept(struct kvm_vcpu *);
+
+struct kvm_x86_ops *kvm_x86_ops;
+int ignore_msrs = 0;
+
+static struct kvm_shared_msrs_global shared_msrs_global;
+
+static void
+kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn)
+{
+	unsigned slot;
+	struct kvm_shared_msrs *locals =
+	    (struct kvm_shared_msrs *)(((caddr_t)urn) -
+	    offsetof(struct kvm_shared_msrs, urn));
+	struct kvm_shared_msr_values *values;
+
+	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+		values = &locals->values[slot];
+		if (values->host != values->curr) {
+			wrmsrl(shared_msrs_global.msrs[slot], values->host);
+			values->curr = values->host;
+		}
+	}
+	locals->registered = 0;
+	kvm_user_return_notifier_unregister(vcpu, urn);
+}
+
+static void
+shared_msr_update(unsigned slot, uint32_t msr)
+{
+	struct kvm_shared_msrs *smsr;
+	uint64_t value;
+	smsr = shared_msrs[CPU->cpu_id];
+
+	/*
+	 * This is only read, and nobody should be modifying it at this
+	 * time, so no lock is needed.
+	 */
+	if (slot >= shared_msrs_global.nr) {
+		cmn_err(CE_WARN, "kvm: invalid MSR slot!");
+		return;
+	}
+
+	rdmsrl_safe(msr, (unsigned long long *)&value);
+	smsr->values[slot].host = value;
+	smsr->values[slot].curr = value;
+}
+
+void
+kvm_define_shared_msr(unsigned slot, uint32_t msr)
+{
+	if (slot >= shared_msrs_global.nr)
+		shared_msrs_global.nr = slot + 1;
+	shared_msrs_global.msrs[slot] = msr;
+#ifdef XXX
+	/* we need to ensure shared_msrs_global has been updated */
+	smp_wmb();
+#else
+	XXX_KVM_SYNC_PROBE;
+#endif
+}
+
+static void
+kvm_shared_msr_cpu_online(void)
+{
+	unsigned i;
+
+	for (i = 0; i < shared_msrs_global.nr; i++)
+		shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+void
+kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value,
+    uint64_t mask)
+{
+	struct kvm_shared_msrs *smsr = shared_msrs[CPU->cpu_id];
+
+	if (((value ^ smsr->values[slot].curr) & mask) == 0)
+		return;
+
+	smsr->values[slot].curr = value;
+	wrmsrl(shared_msrs_global.msrs[slot], value);
+
+	if (!smsr->registered) {
+		smsr->urn.on_user_return = kvm_on_user_return;
+		kvm_user_return_notifier_register(vcpu, &smsr->urn);
+		smsr->registered = 1;
+	}
+}
+
 unsigned long
segment_base(uint16_t selector) { @@ -64,253 +205,1353 @@ segment_base(uint16_t selector) return (v); } +uint64_t +kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return (vcpu->arch.apic_base); + else + return (vcpu->arch.apic_base); +} -struct kvm * -kvm_arch_create_vm(void) +void +kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) { - struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP); + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} - if (!kvm) - return (NULL); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 - if ((kvm->arch.aliases = - kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) { - kmem_free(kvm, sizeof (struct kvm)); - return (NULL); +static int +exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return (EXCPT_PF); + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return (EXCPT_CONTRIBUTORY); + default: + break; } - list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page), - offsetof(struct kvm_mmu_page, link)); + return (EXCPT_BENIGN); +} - list_create(&kvm->arch.assigned_dev_head, - sizeof (struct kvm_assigned_dev_kernel), - offsetof(struct kvm_assigned_dev_kernel, list)); +static void +kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, int has_error, uint32_t error_code) +{ + uint32_t prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { +queue: + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = 1; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else { + /* + * replace previous exception with a new one in a hope + * that instruction re-execution will regenerate lost + * exception + */ + goto queue; + } +} - /* XXX - original is rdtscll() */ - kvm->arch.vm_init_tsc = (uint64_t)gethrtime(); +void +kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, 0, 0); +} - return (kvm); +void +kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + uint32_t error_code) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest); + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } -inline gpa_t -gfn_to_gpa(gfn_t gfn) +void +kvm_inject_nmi(struct kvm_vcpu *vcpu) { - return ((gpa_t)gfn << PAGESHIFT); + vcpu->arch.nmi_pending = 1; } -page_t *pfn_to_page(pfn_t pfn); +void +kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code) +{ + kvm_multiple_exception(vcpu, nr, 1, error_code); +} + +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. 
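+ * Emulation paths for privileged operations are expected to call this
+ * first and bail out on failure; illustrative use, not code from this
+ * change:
+ *
+ *	if (!kvm_require_cpl(vcpu, 0))
+ *		return;
+ *
+ * since on failure the #GP has already been queued on the vcpu.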
+ */
+int
+kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+		return (1);
+	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+	return (0);
+}
+
+/*
+ * Load the pae pdptrs. Return true if they are all valid.
+ */
+int
+load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	gfn_t pdpt_gfn = cr3 >> PAGESHIFT;
+	unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2;
+	int i;
+	int ret;
+	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn,
+	    pdpte, offset * sizeof (uint64_t), sizeof (pdpte));
+
+	if (ret < 0) {
+		ret = 0;
+		goto out;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(pdpte); i++) {
+		if (is_present_gpte(pdpte[i]) &&
+		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
+			ret = 0;
+			goto out;
+		}
+	}
+	ret = 1;
+
+	memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs));
+	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail);
+	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty);
+out:
+	return (ret);
+}
+
+static int
+pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+	if (is_long_mode(vcpu) || !is_pae(vcpu))
+		return (0);
+
+	if (!test_bit(VCPU_EXREG_PDPTR,
+	    (unsigned long *)&vcpu->arch.regs_avail)) {
+		return (1);
+	}
+
+	if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u,
+	    pdpte, sizeof (pdpte)) < 0)
+		return (1);
+
+	return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0);
+}
+
+void
+kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+	if (cr0 & 0xffffffff00000000UL) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+#endif
+
+	cr0 &= ~CR0_RESERVED_BITS;
+
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+		if ((vcpu->arch.efer & EFER_LME)) {
+			int cs_db, cs_l;
+
+			if (!is_pae(vcpu)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+
+			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+			if (cs_l) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+		} else
+#endif
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, cr0);
+	vcpu->arch.cr0 = cr0;
+	kvm_mmu_reset_context(vcpu);
+}
+
+void
+kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+}
+
+void
+kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+	if (cr4 & CR4_RESERVED_BITS) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (!(cr4 & X86_CR4_PAE)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) &&
+	    ((cr4 ^ old_cr4) & pdptr_bits) &&
+	    !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (cr4 & X86_CR4_VMXE) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	kvm_x86_ops->set_cr4(vcpu, cr4);
+	vcpu->arch.cr4 = cr4;
+	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
+}
+
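+/*
+ * Note on the PAE PDPTR handling above: hardware snapshots the four
+ * PDPTEs only on specific events, which is why load_pdptrs() is called
+ * from kvm_set_cr0() (enabling paging under PAE), from kvm_set_cr4()
+ * (toggling any of PGE/PSE/PAE while paging is on), and from
+ * kvm_set_cr3() below, rather than on every guest memory access.
+ */
+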
+void
+kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else {
+		if (is_pae(vcpu)) {
+			if (cr3 & CR3_PAE_RESERVED_BITS) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+		}
+		/*
+		 * We don't check reserved bits in nonpae mode, because
+		 * this isn't enforced, and VMware depends on this.
+		 */
+	}
+
+	/*
+	 * Does the new cr3 value map to physical memory? (Note, we
+	 * catch an invalid cr3 even in real-mode, because it would
+	 * cause trouble later on when we turn on paging anyway.)
+	 *
+	 * A real CPU would silently accept an invalid cr3 and would
+	 * attempt to use it - with largely undefined (and often hard
+	 * to debug) behavior on the guest side.
+	 */
+	if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT)))
+		kvm_inject_gp(vcpu, 0);
+	else {
+		vcpu->arch.cr3 = cr3;
+		vcpu->arch.mmu.new_cr3(vcpu);
+	}
+}
+
+void
+kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (cr8 & CR8_RESERVED_BITS) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (irqchip_in_kernel(vcpu->kvm))
+		kvm_lapic_set_tpr(vcpu, cr8);
+	else
+		vcpu->arch.cr8 = cr8;
+}
+
+unsigned long
+kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		return (kvm_lapic_get_cr8(vcpu));
+	} else {
+		return (vcpu->arch.cr8);
+	}
+}
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that
+ * are kvm-specific; those are put at the beginning of the list.
+ */
+
+#define	MSR_KVM_WALL_CLOCK	0x11
+#define	MSR_KVM_SYSTEM_TIME	0x12
+
+#define	KVM_SAVE_MSRS_BEGIN	5
+static uint32_t msrs_to_save[] = {
+	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+	HV_X64_MSR_APIC_ASSIST_PAGE,
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+	MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+};
+
+static unsigned num_msrs_to_save;
+
+static uint32_t emulated_msrs[] = {
+	MSR_IA32_MISC_ENABLE,
+};
+
+static void
+set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
+{
+	if (efer & efer_reserved_bits) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_paging(vcpu) &&
+	    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (efer & EFER_FFXSR) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	if (efer & EFER_SVME) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	kvm_x86_ops->set_efer(vcpu, efer);
+
+	efer &= ~EFER_LMA;
+	efer |= vcpu->arch.efer & EFER_LMA;
+
+	vcpu->arch.efer = efer;
+
+	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
+}

 void
-kvm_release_pfn_clean(pfn_t pfn)
+kvm_enable_efer_bits(uint64_t mask)
+{
+	efer_reserved_bits &= ~mask;
+}
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
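+ * The store is delegated to the vendor module through
+ * kvm_x86_ops->set_msr(); for MSRs with no vendor-specific behavior
+ * that entry point is in turn expected to fall back to
+ * kvm_set_msr_common() below.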
+ */ +int +kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + return (kvm_x86_ops->set_msr(vcpu, msr_index, data)); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int +do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data) +{ + return (kvm_set_msr(vcpu, index, *data)); +} + +static void +kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { + static int version; + struct pvclock_wall_clock wc; + struct timespec boot; + + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - put_page(pfn_to_page(pfn)); + getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); #else XXX_KVM_PROBE; #endif } -#ifdef IOMMU +static uint32_t +div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; -paddr_t -iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova) + /* + * Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" + */ + __asm__("divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor)); + + return (quotient); +} + +static void +kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) { - return (iommu_ops->iova_to_phys(domain, iova)); + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); } +/* XXX Expected to be per cpu */ +static uint64_t cpu_tsc_khz; +/* XXX extern?! */ +extern uint64_t cpu_freq_hz; -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) +static void +kvm_write_guest_time(struct kvm_vcpu *v) { - gfn_t gfn = base_gfn; - pfn_t pfn; - struct iommu_domain *domain = kvm->arch.iommu_domain; - unsigned long i; - uint64_t phys; + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + unsigned long this_tsc_khz; - /* check if iommu exists and in use */ - if (!domain) + if ((!vcpu->time_page)) return; - for (i = 0; i < npages; i++) { - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - pfn = phys >> PAGESHIFT; - kvm_release_pfn_clean(pfn); - gfn++; + this_tsc_khz = cpu_tsc_khz; + if (vcpu->hv_clock_tsc_khz != this_tsc_khz) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } - iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages); +#ifdef XXX + put_cpu_var(cpu_tsc_khz); +#else + XXX_KVM_PROBE; +#endif + +#ifdef XXX + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); +#else + /* + * may need to mask interrupts for local_irq_save, and unmask + * for local_irq_restore. cli()/sti() might be done... 
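+	 * The Linux original keeps interrupts off so that the TSC sample
+	 * and the gethrestime() call below are taken back to back; if we
+	 * are preempted between the two reads, the tsc_timestamp and
+	 * system_time pair published to the guest can be skewed.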
+ */ + XXX_KVM_PROBE; +#endif + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + gethrestime(&ts); +#ifdef XXX + monotonic_to_bootbased(&ts); + local_irq_restore(flags); +#else + XXX_KVM_PROBE; +#endif + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * + (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just increase by 2 at the end. + */ + vcpu->hv_clock.version += 2; + + shared_kaddr = page_address(vcpu->time_page); + + memcpy((void *)((uintptr_t)shared_kaddr + vcpu->time_offset), + &vcpu->hv_clock, sizeof (vcpu->hv_clock)); + + mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT); } static int -kvm_iommu_unmap_memslots(struct kvm *kvm) +kvm_request_guest_time_update(struct kvm_vcpu *v) { - int i; - struct kvm_memslots *slots; + struct kvm_vcpu_arch *vcpu = &v->arch; - slots = kvm->memslots; + if (!vcpu->time_page) + return (0); + + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + + return (1); +} - for (i = 0; i < slots->nmemslots; i++) { - kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, - slots->memslots[i].npages); +static int +msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return (1); + case 0x2f8: + return (1); } return (0); } -int -kvm_iommu_unmap_guest(struct kvm *kvm) +static int +valid_pat_type(unsigned t) { - struct iommu_domain *domain = kvm->arch.iommu_domain; + return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */ +} + +static int +valid_mtrr_type(unsigned t) +{ + return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */ +} - /* check if iommu exists and in use */ - if (!domain) +static int +mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + int i; + + if (!msr_mtrr_valid(msr)) return (0); - kvm_iommu_unmap_memslots(kvm); - iommu_domain_free(domain); + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return (0); + return (valid_mtrr_type(data & 0xff)); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } + + /* variable MTRRs */ + return (valid_mtrr_type(data & 0xff)); +} + +static int +set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return (1); + + if (msr == MSR_MTRRdefType) { + state->def_type = data; + state->enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat 
= data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pt = data; + } + + kvm_mmu_reset_context(vcpu); + return (0); } -#endif /* IOMMU */ -static void -kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +static int +set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) { - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + if (data != 0 && data != ~(uint64_t)0) + return (-1); + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + /* + * only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(uint64_t)0) + return (-1); + vcpu->arch.mce_banks[offset] = data; + break; + } + return (1); + } + return (0); } -static void -kvm_free_vcpus(struct kvm *kvmp) +static int +xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) { - int ii, maxcpus; + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + uint8_t *blob_addr = lm ? + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 : + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + uint8_t blob_size = lm ? + kvm->arch.xen_hvm_config.blob_size_64 : + kvm->arch.xen_hvm_config.blob_size_32; + uint32_t page_num = data & ~PAGEMASK; + uint64_t page_addr = data & PAGEMASK; + uint8_t *page; + int r; - maxcpus = kvmp->online_vcpus; - XXX_KVM_SYNC_PROBE; - for (ii = 0; ii < maxcpus; ii++) - kvm_unload_vcpu_mmu(kvmp->vcpus[ii]); + r = E2BIG; + if (page_num >= blob_size) + goto out; + r = ENOMEM; + page = kmem_alloc(PAGESIZE, KM_SLEEP); + r = EFAULT; + if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) + goto out_free; + r = 0; +out_free: + kmem_free(page, PAGESIZE); +out: + return (r); +} - for (ii = 0; ii < maxcpus; ii++) - kvm_arch_vcpu_free(kvmp->vcpus[ii]); - mutex_enter(&kvmp->lock); - for (ii = 0; ii < maxcpus; ii++) - kvmp->vcpus[ii] = NULL; - kvmp->online_vcpus = 0; - mutex_exit(&kvmp->lock); +static int +kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE); } -/* - * This function exists because of a difference in methodologies from our - * ancestor. With our ancestors, there is no imputus to clean up lists and - * mutexes. This is unfortunate, because they seem to even have debug kernels - * which would seemingly check for these kinds of things. But because in the - * common case mutex_exit is currently a #define to do {} while(0), it seems - * that they just ignore this. - * - * This leads to the following behavior: during our time we create a lot of - * auxillary structs potentially related to pits, apics, etc. Tearing down these - * structures relies on having the correct locks, etc. However - * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. 
it's doing - * the kmem_free. Logically these auxillary structures need to be freed and - * dealt with before we go back and do the rest of the tear down related to the - * device. - */ -void -kvm_arch_destroy_vm_comps(struct kvm *kvmp) +static int +kvm_hv_msr_partition_wide(uint32_t msr) { - if (kvmp == NULL) + int r = 0; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = 1; + break; + } -#ifdef IOMMU - kvm_iommu_unmap_guest(kvmp); + return (r); +} + +static int +set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + uint64_t gfn; + unsigned long addr; + uint8_t instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (1); + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copyout(instructions, (caddr_t)addr, 4)) + return (1); + kvm->arch.hv_hypercall = data; + break; + } + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +static int +clear_user(void *addr, unsigned long size) +{ + caddr_t ka; + int rval = 0; + + ka = kmem_zalloc(size, KM_SLEEP); + rval = copyout(ka, addr, size); + kmem_free(ka, size); + + return (rval); +} + +static int +set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + + addr = gfn_to_hva(vcpu->kvm, + data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + + if (kvm_is_error_hva(addr)) + return (1); + + if (clear_user((void *)addr, PAGESIZE)) + return (1); + + vcpu->arch.hv_vapic = data; + break; + } + + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data)); + + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +int +kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_K7_HWCR: + data &= ~(uint64_t)0x40; /* ignore flush filter disable */ + if (data != 0) { + cmn_err(CE_NOTE, + "unimplemented HWCR wrmsr: 0x%lx\n", data); + return (1); + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%lx\n", data); + return (1); + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* + * Values other than LBR and BTF are vendor-specific, + * thus reserved and should throw a #GP + */ + return (1); + } + 
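+		/*
+		 * Accepting (and merely logging) LBR/BTF writes here,
+		 * rather than injecting #GP, keeps guests that poke
+		 * DEBUGCTL during boot running; the bits themselves are
+		 * not virtualized.
+		 */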
cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return (set_msr_mtrr(vcpu, msr, data)); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_write(vcpu, msr, data)); + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { +#ifdef XXX + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } #else - XXX_KVM_PROBE; -#endif /* IOMMU */ - kvm_free_pit(kvmp); - kvm_free_vcpus(kvmp); - kvm_free_physmem(kvmp); + XXX_KVM_PROBE; +#endif + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); #ifdef XXX -#ifdef APIC - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); -#endif /* APIC */ + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGESHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_request_guest_time_update(vcpu); #else - XXX_KVM_PROBE; -#endif /* XXX */ -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - cleanup_srcu_struct(&kvm->srcu); -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + XXX_KVM_PROBE; +#endif + break; + } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (set_msr_mce(vcpu, msr, data)); + + /* + * Performance counters are not protected by a CPUID bit, so we should + * check all of them in the generic path for the sake of cross vendor + * migration. Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + /* + * at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (set_msr_hyperv(vcpu, msr, data)); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return (xen_hvm_config(vcpu, data)); + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %lx\n", + msr, data); + return (1); + } else { + cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %lx\n", + msr, data); + break; + } + } + + return (0); } -void -kvm_arch_destroy_vm(struct kvm *kvmp) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int +kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) { - if (kvmp == NULL) - return; /* nothing to do here */ + return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata)); +} - if (kvmp->arch.aliases) { - kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases)); - kvmp->arch.aliases = NULL; +static int +get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return (1); + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pdata = *pt; } - kmem_free(kvmp, sizeof (struct kvm)); + + return (0); } -#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ -#define MSR_IA32_FEATURE_CONTROL 0x0000003a +static int +get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return (1); + } + *pdata = data; + return (0); +} -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +static int +get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + struct kvm *kvm = vcpu->kvm; -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } -void 
-kvm_shared_msr_cpu_online(void) + *pdata = data; + + return (0); +} + +static int +get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) { - unsigned i; + uint64_t data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata)); + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } - for (i = 0; i < shared_msrs_global.nr; i++) - shared_msr_update(i, shared_msrs_global.msrs[i]); + *pdata = data; + return (0); } int -kvm_arch_hardware_enable(void *garbage) +kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return (get_msr_mtrr(vcpu, msr, pdata)); + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_read(vcpu, msr, pdata)); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (get_msr_mce(vcpu, msr, pdata)); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (get_msr_hyperv(vcpu, msr, pdata)); + break; + default: + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); + return (1); + } else { + cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + + return (0); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int +__msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, uint64_t *data)) { + int i, idx; + + vcpu_load(vcpu); + #ifdef XXX - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + for (i = 0; i < msrs->nmsrs; i++) { + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; } + +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - kvm_shared_msr_cpu_online(); + vcpu_put(vcpu); - return (kvm_x86_ops->hardware_enable(garbage)); + return (i); } -void -kvm_arch_hardware_disable(void *garbage) +int +kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { - kvm_x86_ops->hardware_disable(garbage); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - drop_user_return_notifiers(garbage); -#endif + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0) + return (r); + + *rv = r; + + return (0); } -static inline int -iommu_found(void) +int +kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0) + return (-EINVAL); + + *rv = r; + return (0); } @@ -374,7 +1615,7 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) r = EINVAL; break; case KVM_CAP_IOMMU: - *rval_p = iommu_found(); + *rval_p = 0; r = DDI_SUCCESS; break; case KVM_CAP_MCE: @@ -389,598 +1630,3166 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) return (r); } -static inline int -apic_x2apic_mode(struct kvm_lapic *apic) +/* XXX Some part of kvm_ioctl goes here? */ + +void +kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - return (apic->vcpu->arch.apic_base & X2APIC_ENABLE); + kvm_x86_ops->vcpu_load(vcpu, cpu); +#ifdef XXX + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { + unsigned long khz = cpufreq_quick_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } +#else + XXX_KVM_PROBE; +#endif + kvm_request_guest_time_update(vcpu); } void -kvm_inject_nmi(struct kvm_vcpu *vcpu) +kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = 1; + kvm_put_guest_fpu(vcpu); + + kvm_x86_ops->vcpu_put(vcpu); +} + +static int +is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return (efer & EFER_NX); } int -kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +{ + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + return (E2BIG); + + bcopy(cpuid->entries, vcpu->arch.cpuid_entries, + cpuid->nent * sizeof (struct kvm_cpuid_entry2)); + + vcpu_load(vcpu); + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - page_t *page; - struct kvm *kvm; int r; + struct kvm_cpuid_entry2 *entries = cpuid->entries; - kvm = vcpu->kvm; + cpuid->nent = vcpu->arch.cpuid_nent; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + if (cpuid->nent < vcpu->arch.cpuid_nent) + return (E2BIG); - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + bcopy(&vcpu->arch.cpuid_entries, cpuid->entries, + vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2)); + return (0); +} + +static inline void native_cpuid(unsigned int 
*eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + __asm__ volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +#define __cpuid native_cpuid + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void +cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +static void +do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + + +#define F(x) bit(X86_FEATURE_##x) + +static void +do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, + uint32_t index, int *nent, int maxnent) +{ + unsigned int ddic; + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const uint32_t kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const uint32_t kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const uint32_t kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ + const uint32_t kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; + + /* all calls to cpuid_count() should be made on the same cpu */ + /* XXX - right now, system panics at ddi_exit_critical() */ + /* XXX - to run everything on same cpu, bind qemu at startup */ + + kpreempt_disable(); + + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (uint32_t)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; + /* + * we support x2apic emulation even if host does not support + * it since we emulate x2apic in software + */ + entry->ecx |= F(X2APIC); + break; /* - * page = alloc_page(PAGESIZE, KM_SLEEP); - * if (!page) { - * r = ENOMEM; - 
* goto fail; - * } - * vcpu->arch.pio_data = page_address(page); + * function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - vcpu->arch.pio_data = (caddr_t)vcpu->run + - (KVM_PIO_PAGE_OFFSET * PAGESIZE); + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + /* + * XXX - see comment above for ddi_enter_critical() + * + * ddi_exit_critical(ddic); + */ + kpreempt_enable(); +} - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail; +#undef F - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } +int +kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = E2BIG; + uint32_t func; + int allocsize = 0; - vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * - sizeof (uint64_t) * 4, KM_SLEEP); + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = ENOMEM; + allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent; + cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP); + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + r = EFAULT; + if (copyout(cpuid_entries, entries, + nent * sizeof (kvm_cpuid_entry2_t))) + goto out_free; + + cpuid->nent = nent; + r = 0; + +out_free: + kmem_free(cpuid_entries, allocsize); +out: + return (r); +} - if (!vcpu->arch.mce_banks) { - r = ENOMEM; - goto fail_free_lapic; - } +int +kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + 
vcpu_load(vcpu); + bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s)); + vcpu_put(vcpu); - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return (0); +} + +int +kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s)); + kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); + vcpu_put(vcpu); return (0); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail: - return (r); } -void -kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +int +kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 * - KVM_MAX_MCE_BANKS); - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); + if (irq->irq < 0 || irq->irq >= 256) + return (-EINVAL); + + if (irqchip_in_kernel(vcpu->kvm)) + return (-ENXIO); + + vcpu_load(vcpu); + + kvm_queue_interrupt(vcpu, irq->irq, 0); + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp) +{ + int rval; + uint64_t mcg_cap = *mcg_capp; + unsigned bank_num = mcg_cap & 0xff, bank; + + rval = -EINVAL; + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + rval = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(uint64_t)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(uint64_t)0; +out: + return (rval); } +int +kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = vcpu->arch.interrupt.soft; + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR); + + vcpu_put(vcpu); + + return (0); +} int -kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { - int r; + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + return (-EINVAL); + + vcpu_load(vcpu); + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; - mutex_init(&vcpu->mutex, NULL, MUTEX_DRIVER, 0); - vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + + vcpu->arch.nmi_injected = events->nmi.injected; + + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + 
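+		/*
+		 * Note that only the pending-NMI and SIPI-vector fields
+		 * are gated by events->flags; the exception and
+		 * interrupt state above is always taken from userland.
+		 */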
vcpu->arch.nmi_pending = events->nmi.pending; + + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr) +{ + /* + * XXX They have some other code here to check the validity of the + * address + */ + return (kvm_x86_ops->set_tss_addr(kvmp, addr)); +} + +int +kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return (0); +} + +gfn_t +unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; #ifdef XXX - init_waitqueue_head(&vcpu->wq); + aliases = rcu_dereference(kvm->arch.aliases); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; + aliases = kvm->arch.aliases; #endif - vcpu->run = ddi_umem_alloc(PAGESIZE * 2, DDI_UMEM_SLEEP, &vcpu->cookie); - r = kvm_arch_vcpu_init(vcpu); + for (i = 0; i < aliases->naliases; i++) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); + } - if (r != 0) { - vcpu->run = NULL; - ddi_umem_free(vcpu->cookie); - return (r); + return (gfn); +} + +gfn_t +unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + /* XXX need protection */ + aliases = kvm->arch.aliases; + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); } + return (gfn); +} + +int +kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + return (r); +} + +int +kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + kvm_pic_update_irq(pic_irqchip(kvm)); + + return (r); +} + +int +kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + struct kvm_pit *vpit = kvm->arch.vpit; + + mutex_enter(&vpit->pit_state.lock); + memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels)); + ps->flags = vpit->pit_state.flags; + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +int +kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + boolean_t prev_legacy, cur_legacy, start = B_FALSE; + struct kvm_pit *vpit = kvm->arch.vpit; + + 
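+	/*
+	 * Channel 0 is restarted only when HPET legacy routing is being
+	 * switched on (prev_legacy clear, cur_legacy set); otherwise the
+	 * restored count is loaded without re-arming the timer.
+	 */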
mutex_enter(&vpit->pit_state.lock); + prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + + if (!prev_legacy && cur_legacy) + start = B_TRUE; + + memcpy(&vpit->pit_state.channels, &ps->channels, + sizeof (vpit->pit_state.channels)); + + vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start); + + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +/* TODO: As Pascal would say, we can do better */ +int +kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg) +{ + + struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; + struct kvm_msr_list *msr_list; + size_t sz = sizeof (struct kvm_msr_list); + unsigned n; + + msr_list = kmem_zalloc(sz, KM_SLEEP); + + if (copyin(user_msr_list, msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + n = msr_list->nmsrs; + msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + + if (copyout(msr_list, user_msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (n < msr_list->nmsrs) { + kmem_free(msr_list, sz); + return (E2BIG); + } + + if (copyout(&msrs_to_save, user_msr_list->indices, + num_msrs_to_save * sizeof (uint32_t))) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (copyout(&emulated_msrs, user_msr_list->indices + + num_msrs_to_save, ARRAY_SIZE(emulated_msrs) * + sizeof (uint32_t)) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + kmem_free(msr_list, sz); return (0); } /* - * For pages for which vmx needs physical addresses, - * linux allocates pages from an area that maps virtual - * addresses 1-1 with physical memory. In this way, - * translating virtual to physical just involves subtracting - * the start of the area from the virtual address. - * This solaris version uses kmem_alloc, so there is no - * direct mapping of virtual to physical. We'll change this - * later if performance is an issue. For now, we'll use - * hat_getpfnum() to do the conversion. Also note that - * we're assuming 64-bit address space (we won't run on - * 32-bit hardware). + * Get (and clear) the dirty memory log for a memory slot. */ -uint64_t -kvm_va2pa(caddr_t va) +int +kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - uint64_t pa; + int r, i; + struct kvm_memory_slot *memslot; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET); - return (pa); -} + mutex_enter(&kvm->slots_lock); -#ifdef XXX_KVM_DECLARATION -unsigned long *vmx_io_bitmap_a; -unsigned long *vmx_io_bitmap_b; -unsigned long *vmx_msr_bitmap_legacy; -unsigned long *vmx_msr_bitmap_longmode; + r = EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap = kmem_alloc(n, KM_SLEEP); + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n / sizeof (long); i++) + is_dirty = memslot->dirty_bitmap[i]; + + /* If nothing is dirty, don't bother messing with page tables. 
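+	 * Otherwise a zeroed replacement bitmap is published in a fresh
+	 * memslots copy, and the old bitmap, which holds the dirty bits,
+	 * is what gets copied out to userland and freed.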
*/ + if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + + mutex_enter(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + mutex_exit(&kvm->mmu_lock); + + slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; +#ifdef XXX + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); #else -/* make these arrays to try to force into low 4GB memory... */ -/* also need to be aligned... */ -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)]; + kvm->memslots = slots; + XXX_KVM_SYNC_PROBE; #endif + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kmem_free(old_slots, sizeof (struct kvm_memslots)); + } -struct kvm_vcpu * -kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) -{ - char buf[32]; - struct kvm_vcpu *vcpu; - kstat_t *kstat; + r = 0; + if (copyout(dirty_bitmap, log->v.dirty_bitmap, n) != 0) + r = EFAULT; +out_free: + kmem_free(dirty_bitmap, n); +out: + mutex_exit(&kvm->slots_lock); + return (r); +} - (void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid); +/* XXX kvm_arch_vm_ioctl */ - if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED, - sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) == NULL) { - return (NULL); +static void +kvm_init_msr_list(void) +{ + uint32_t dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; } + num_msrs_to_save = j; +} - vcpu = kvm_x86_ops->vcpu_create(kvm, id); +static int +vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - if (vcpu == NULL) { - kstat_delete(kstat); - return (NULL); - } + return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - vcpu->kvcpu_kstat = kstat; - vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats; +static int +vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid; + return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "pid"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvm_pid; +gpa_t +kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
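+	    /*
+	     * The PFERR_* bits select which permission check the
+	     * software walker (arch.mmu.gva_to_gpa) applies: USER when
+	     * the guest is at CPL 3, with WRITE or FETCH or'ed in by
+	     * the sibling helpers.
+	     */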
+ PFERR_USER_MASK : 0; - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail, - "inst-emulation-fail"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls"); + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); +} - kstat_install(vcpu->kvcpu_kstat); +gpa_t +kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - return (vcpu); + access |= PFERR_WRITE_MASK; + + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); } -void -kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +static int +kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t access, uint32_t *error) { - if (vcpu->arch.time_page) { - /* XXX We aren't doing anything with the time page */ - XXX_KVM_PROBE; - vcpu->arch.time_page = NULL; + uintptr_t data = (uintptr_t)val; + int r = 0; /* X86EMUL_CONTINUE */ + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, + access, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = 1; /* X86EMUL_PROPAGATE_FAULT */ + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, (void *)data, toread); + if (ret < 0) { + r = 1; /* X86EMUL_UNHANDLEABLE */ + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; } +out: + return (r); +} - if (vcpu->kvcpu_kstat != NULL) - kstat_delete(vcpu->kvcpu_kstat); +/* used for instruction fetching */ +static int +kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - kvm_x86_ops->vcpu_free(vcpu); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error)); } +static int +kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
+ PFERR_USER_MASK : 0; -uint64_t -kvm_get_apic_base(struct kvm_vcpu *vcpu) + return (kvm_read_guest_virt_helper(addr, val, + bytes, vcpu, access, error)); +} + +static int +kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - if (irqchip_in_kernel(vcpu->kvm)) - return (vcpu->arch.apic_base); - else - return (vcpu->arch.apic_base); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error)); } -void -kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) +static int +kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; + uintptr_t data = (uintptr_t)val; + + while (bytes) { + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) + return (X86EMUL_PROPAGATE_FAULT); + + if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0) + return (X86EMUL_UNHANDLEABLE); + + bytes -= towrite; + data += towrite; + addr += towrite; + } + + return (0); } -void -kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +static int +emulator_read_emulated(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; + gpa_t gpa; + uint32_t error_code; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, + vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val); + + vcpu->mmio_read_completed = 0; + return (X86EMUL_CONTINUE); } - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, + bytes, vcpu, NULL) == X86EMUL_CONTINUE) + return (X86EMUL_CONTINUE); + +mmio: + /* + * Is this MMIO handled locally? 
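+	 * That is, does an in-kernel device model (such as the local APIC)
+	 * claim this physical address? If not, record the access in the
+	 * vcpu->mmio_* fields and return X86EMUL_UNHANDLEABLE so that the
+	 * read can be completed in userspace.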
+ */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + return (X86EMUL_CONTINUE); + } + + KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes, + uintptr_t, gpa); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return (X86EMUL_UNHANDLEABLE); } int -is_paging(struct kvm_vcpu *vcpu) +emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) { - return (kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + + if (ret < 0) + return (0); + + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + + return (1); } -unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; +static int +emulator_write_emulated_onepage(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return (X86EMUL_CONTINUE); + +mmio: + KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return (X86EMUL_CONTINUE); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return (X86EMUL_CONTINUE); +} int -kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +emulator_write_emulated(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - return (kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len)); + uintptr_t data = (uintptr_t)val; + + /* Crossing a page boundary? 
*/ + if (((addr + bytes - 1) ^ addr) & PAGEMASK) { + int rc, now; + + now = -addr & ~PAGEMASK; + rc = emulator_write_emulated_onepage(addr, + (void *)data, now, vcpu); + + if (rc != X86EMUL_CONTINUE) + return (rc); + + addr += now; + data += now; + bytes -= now; + } + + return (emulator_write_emulated_onepage(addr, val, bytes, vcpu)); +} + +static int +emulator_cmpxchg_emulated(unsigned long addr, const void *old, + const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) +{ + cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + page_t page; + char *kaddr; + uint64_t val; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) + goto emul_write; + + val = *(uint64_t *)new; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); + kaddr = kmap_atomic(page, KM_USER0); + + set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + } +emul_write: +#endif + + return (emulator_write_emulated(addr, new, bytes, vcpu)); +} + +static unsigned long +get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return (kvm_x86_ops->get_segment_base(vcpu, seg)); } void -fx_init(struct kvm_vcpu *vcpu) +kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - unsigned after_mxcsr_mask; + uint8_t opcodes[4]; + unsigned long rip = kvm_rip_read(vcpu); + unsigned long rip_linear; + #ifdef XXX - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. 
- */ - if (!used_math()) + if (!printk_ratelimit()) + return; #else XXX_KVM_PROBE; #endif - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ - kpreempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - kpreempt_enable(); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image + - after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) - - after_mxcsr_mask); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); + + cmn_err(CE_WARN, "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +static void +cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; } int -kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + uint16_t error_code, int emulation_type) { - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = 0; + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; - vcpu->arch.switch_db_regs = 0; - memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; - vcpu->arch.dr7 = DR7_FIXED_1; + kvm_clear_exception_queue(vcpu); + vcpu->arch.mmio_fault_cr2 = cr2; - return (kvm_x86_ops->vcpu_reset(vcpu)); + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? + X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ? + X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* + * Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall) + */ + c = &vcpu->arch.emulate_ctxt.decode; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return (EMULATE_FAIL); + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return (EMULATE_FAIL); + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + default: + return (EMULATE_FAIL); + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return (EMULATE_FAIL); + } + + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation); + + if (r) { + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail); + + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + return (EMULATE_FAIL); + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return (EMULATE_DONE); + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return (EMULATE_DO_MMIO); + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return (EMULATE_FAIL); + } + + return (EMULATE_DO_MMIO); + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return (EMULATE_DO_MMIO); + } + + return (EMULATE_DONE); } -struct kvm_memory_slot * -gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static int +pio_copy_data(struct kvm_vcpu *vcpu) { - gfn = unalias_gfn(kvm, gfn); - return (gfn_to_memslot_unaliased(kvm, gfn)); + void *p = vcpu->arch.pio_data; + gva_t q = vcpu->arch.pio.guest_gva; + unsigned bytes; + int ret; + uint32_t error_code; + + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + + if (vcpu->arch.pio.in) + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); + else + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + + return (ret); } -unsigned long -kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +int +complete_pio(struct kvm_vcpu *vcpu) { - struct vm_area_struct *vma; - unsigned long addr, size; + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + unsigned long val; - size = PAGESIZE; + if (!io->string) { + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) + goto out; + } - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (PAGESIZE); + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. 
+ */ + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } + } +out: + io->count -= io->cur_count; + io->cur_count = 0; -#ifdef XXX - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) + return (0); +} + +static int +kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) { + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + } else { + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, pd); + } + + return (r); +} + +int +kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) +{ + unsigned long val; + + DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size, + unsigned long, 1) + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE; + vcpu->run->io.count = vcpu->arch.pio.count = + vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.rep = 0; + + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + complete_pio(vcpu); + return (1); + } + + return (0); +} + +void +kvm_timer_fire(void *arg) +{ + struct kvm_timer *timer = (struct kvm_timer *)arg; + struct kvm_vcpu *vcpu = timer->vcpu; + + if (vcpu == NULL) + return; + + mutex_enter(&vcpu->kvcpu_kick_lock); + + if (timer->reinject || !timer->pending) { + atomic_add_32(&timer->pending, 1); + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + } + + timer->intervals++; + + cv_broadcast(&vcpu->kvcpu_kick_cv); + mutex_exit(&vcpu->kvcpu_kick_lock); +} + +static void +kvm_timer_init(void) +{ + int cpu; + + /* + * XXX We assume that any machine running solaris kvm + * has constant time stamp counter increment rate. + * This will be true for all but older machines. 
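+	 * (For example, a 2.4 GHz constant-TSC CPU yields
+	 * cpu_tsc_khz = 2400000000 / 1000 = 2400000.)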
+ */ + /* assume pi_clock in mhz */ + cpu_tsc_khz = (cpu_freq_hz / 1000); +} + +int +kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (ops->cpu_has_kvm_support()) { + cmn_err(CE_WARN, "kvm: no hardware support\n"); + r = ENOTSUP; goto out; + } + if (ops->disabled_by_bios()) { + cmn_err(CE_WARN, "kvm: disabled by bios\n"); + r = ENOTSUP; + goto out; + } - size = vma_kernel_pagesize(vma); + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); + + kvm_timer_init(); + + return (0); out: - up_read(¤t->mm->mmap_sem); - return (size); + return (r); +} + +int +kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits); + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + return (1); + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return (0); + } +} + +int +kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + uint64_t param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + int fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return (0); + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RAX) & 0xffffffff); + + ingpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RCX) & 0xffffffff); + + outgpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast, + uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa, + uintptr_t, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +#ifdef XXX + kvm_vcpu_on_spin(vcpu); #else - XXX_KVM_PROBE; - return (PAGESIZE); + XXX_KVM_PROBE; #endif + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((uint64_t)rep_done & 0xfff) << 32); + + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return (1); } -static pfn_t -hva_to_pfn(struct kvm *kvm, unsigned long addr) +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 + +int +kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - page_t page[1]; - int npages; - pfn_t pfn; - proc_t *procp = ttoproc(curthread); - struct as *as = procp->p_as; + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return (kvm_hv_hypercall(vcpu)); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = 
kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + + KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1, + uintptr_t, a2, uintptr_t, a3); + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -EPERM; + goto out; + } + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: #ifdef XXX + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); +#else + XXX_KVM_PROBE; + ret = -ENOSYS; +#endif + break; + default: + ret = -ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - npages = get_user_pages_fast(addr, 1, 1, page); + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls); - if (unlikely(npages != 1)) { - struct vm_area_struct *vma; + return (r); +} - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); +static int +move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return (j); + } + } - if (vma == NULL || addr < vma->vm_start || - !(vma->vm_flags & VM_PFNMAP)) { - up_read(¤t->mm->mmap_sem); - get_page(bad_page); - return (page_to_pfn(bad_page)); + return (0); /* silence gcc, even though control never reaches here */ +} + +/* + * find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) + */ +static int +is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + uint32_t function, uint32_t index) +{ + if (e->function != function) + return (0); + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return (0); + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return (0); + return (1); +} + +struct kvm_cpuid_entry2 * +kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } - pfn = ((addr - vma->vm_start) >> PAGESHIFT) + vma->vm_pgoff; - up_read(¤t->mm->mmap_sem); - BUG_ON(!kvm_is_mmio_pfn(pfn)); - } else - pfn = page_to_pfn(page[0]); -#else - XXX_KVM_PROBE; - if (addr < kernelbase) - pfn = hat_getpfnum(as->a_hat, (caddr_t)addr); + return (best); +} + +int +cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + return (36); /* from linux. number of bits, perhaps? 
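+	 * (Yes: the guest's physical-address width in bits. Linux reads it
+	 * from CPUID leaf 0x80000008, EAX bits 7:0, and uses 36 only as the
+	 * fallback; this port simply hardcodes that conservative default.)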
*/ +} + +void +kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + uint32_t function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + + KVM_TRACE5(cpuid, uint32_t, function, + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RAX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RBX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RCX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RDX)); +} + +static int +dm_request_for_irq_injection(struct kvm_vcpu *vcpu) +{ + return (!irqchip_in_kernel(vcpu->kvm) && + !kvm_cpu_has_interrupt(vcpu) && + vcpu->run->request_interrupt_window && + kvm_arch_interrupt_allowed(vcpu)); +} + +static void +post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; else - pfn = hat_getpfnum(kas.a_hat, (caddr_t)addr); + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +} + +static void +vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + page_t *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + + vcpu->arch.apic->vapic_page = page; +} + +static void +vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; + + if (!apic || !apic->vapic_addr) + return; +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#else + XXX_KVM_SYNC_PROBE; #endif - return (pfn); } -pfn_t -gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static void +update_cr8_intercept(struct kvm_vcpu *vcpu) { - unsigned long addr; - pfn_t pfn; + int max_irr, tpr; - addr = gfn_to_hva(kvm, gfn); + if (!kvm_x86_ops->update_cr8_intercept) + return; - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return (page_to_pfn(bad_page)); - } + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; - pfn = hva_to_pfn(kvm, addr); + if (max_irr != -1) + max_irr >>= 4; + tpr = kvm_lapic_get_cr8(vcpu); - return (pfn); + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } +static void +inject_pending_event(struct kvm_vcpu *vcpu) +{ + /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } -int -is_error_pfn(pfn_t pfn) + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } 
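+	/*
+	 * As with the exception and NMI cases above, reinjection of an
+	 * already-delivered interrupt takes priority over injecting any
+	 * new event below.
+	 */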
+ + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = 0; + vcpu->arch.nmi_injected = 1; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + 0); + kvm_x86_ops->set_irq(vcpu); + } + } +} + +static inline unsigned long +native_get_debugreg(int regno) { - return (pfn == bad_pfn); + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + __asm__("mov %%db0, %0" :"=r" (val)); + break; + case 1: + __asm__("mov %%db1, %0" :"=r" (val)); + break; + case 2: + __asm__("mov %%db2, %0" :"=r" (val)); + break; + case 3: + __asm__("mov %%db3, %0" :"=r" (val)); + break; + case 6: + __asm__("mov %%db6, %0" :"=r" (val)); + break; + case 7: + __asm__("mov %%db7, %0" :"=r" (val)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register retrieval, " + "regno = %d\n", regno); + } + + return (val); } -page_t * -pfn_to_page(pfn_t pfn) +static inline void +native_set_debugreg(int regno, unsigned long value) { - return (page_numtopp_nolock(pfn)); + switch (regno) { + case 0: + __asm__("mov %0, %%db0" ::"r" (value)); + break; + case 1: + __asm__("mov %0, %%db1" ::"r" (value)); + break; + case 2: + __asm__("mov %0, %%db2" ::"r" (value)); + break; + case 3: + __asm__("mov %0, %%db3" ::"r" (value)); + break; + case 6: + __asm__("mov %0, %%db6" ::"r" (value)); + break; + case 7: + __asm__("mov %0, %%db7" ::"r" (value)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register set, " + "regno = %d\n", regno); + } } -void -kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn) +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) + +static int +vcpu_enter_guest(struct kvm_vcpu *vcpu) { + int r; + + int req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + } + + r = kvm_mmu_reload(vcpu); + + if (r) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, + &vcpu->requests)) { + __kvm_migrate_timers(vcpu); + } + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, + &vcpu->requests)) { + kvm_write_guest_time(vcpu); + } + + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); + + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, + &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + } + + kpreempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); + + cli(); + + clear_bit(KVM_REQ_KICK, &vcpu->requests); #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); + smp_mb__after_clear_bit(); #else XXX_KVM_PROBE; #endif -} + 
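+
+	/*
+	 * Re-check for pending requests and signals now that interrupts
+	 * are disabled and KVM_REQ_KICK has been cleared: anything that
+	 * raced in must abort the entry here, or it would be ignored
+	 * until the next VM exit.
+	 */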
if (vcpu->requests || issig(JUSTLOOKING)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); + kpreempt_enable(); + r = 1; + goto out; + } -void -kvm_set_pfn_dirty(pfn_t pfn) -{ + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) - SetPageDirty(page); /* XXX - not defined in linux?! */ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_PROBE; +#endif + kvm_guest_enter(); + + if (vcpu->arch.switch_db_regs) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); } + + KVM_TRACE1(vm__entry, int, vcpu->vcpu_id); + + kvm_x86_ops->run(vcpu); +#ifdef XXX + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); #else XXX_KVM_PROBE; #endif -} + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); -int -memslot_id(struct kvm *kvm, gfn_t gfn) -{ - int i; -#ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#ifdef XXX + local_irq_enable(); /* XXX - should be ok with kpreempt_enable below */ + + barrier(); #else - struct kvm_memslots *slots = kvm->memslots; + XXX_KVM_PROBE; #endif - struct kvm_memory_slot *memslot = NULL; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_exits); + kvm_guest_exit(); - gfn = unalias_gfn(kvm, gfn); - for (i = 0; i < slots->nmemslots; ++i) { - memslot = &slots->memslots[i]; + kpreempt_enable(); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - break; + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } +#else + XXX_KVM_PROBE; +#endif + kvm_lapic_sync_from_vapic(vcpu); + r = kvm_x86_ops->handle_exit(vcpu); - return (memslot - slots->memslots); +out: + return (r); } -void -kvm_release_pfn_dirty(pfn_t pfn) +static int +__vcpu_run(struct kvm_vcpu *vcpu) { - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); -} + int r; + struct kvm *kvm = vcpu->kvm; -int -cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - return (36); /* from linux. number of bits, perhaps? 
*/ -} + if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return (r); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu); + else { +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_vcpu_block(vcpu); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (test_and_clear_bit(KVM_REQ_UNHALT, + &vcpu->requests)) { + switch (vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_irq_exits); + } + + if (issig(JUSTLOOKING)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_signal_exits); + } + } +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + post_kvm_run_save(vcpu); + vapic_exit(vcpu); + + return (r); +} int -kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r; - unsigned long addr; - gfn_t gfn = gpa >> PAGESHIFT; - int offset = offset_in_page(gpa); + sigset_t sigsaved; + struct kvm_run *kvm_run = vcpu->run; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (-EFAULT); + vcpu_load(vcpu); + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { #ifdef XXX - pagefault_disable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - - r = copyin((caddr_t)addr + offset, data, len); + r = complete_pio(vcpu); #ifdef XXX - pagefault_enable(); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - if (r) - return (-EFAULT); + if (r) + goto out; + } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
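+			 * The re-run emulation produced another MMIO access
+			 * (e.g. the write half of a read-modify-write), so
+			 * we must return to userspace to satisfy it before
+			 * the instruction can complete.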
+ */ + r = 0; + goto out; + } + } + + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return (r); +} + +int +kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + vcpu_put(vcpu); return (0); } +int +kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = 0; + + vcpu_put(vcpu); + + return (0); +} + +void +kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +int +kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = 
kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof (sregs->interrupt_bitmap)); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) { + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); + } + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); + return (0); +} + +int +kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + vcpu->arch.mp_state = mp_state->mp_state; + vcpu_put(vcpu); + return (0); +} + +static void +kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + static void -ack_flush(void *_completed) +seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); + if (seg_desc->c.b.g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->c.b.type; + kvm_desct->present = seg_desc->c.b.p; + kvm_desct->dpl = seg_desc->c.b.dpl; + kvm_desct->db = seg_desc->c.b.d; + kvm_desct->s = seg_desc->c.b.s; + kvm_desct->l = seg_desc->c.b.l; + kvm_desct->g = seg_desc->c.b.g; + kvm_desct->avl = seg_desc->c.b.avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void +get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, uint16_t selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int +load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + int ret; + uint32_t err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return (1); + } + + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof (*seg_desc), + vcpu, &err); + + if (ret == 1) + kvm_inject_page_fault(vcpu, addr, err); + + return (ret); +} + +/* allowed just for 8 bytes segments */ +static int +save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return (1); + + return kvm_write_guest_virt(dtable.base + index * 8, seg_desc, + sizeof (*seg_desc), vcpu, NULL); +} + +static gpa_t +get_tss_base_addr_write(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL)); +} + +static gpa_t +get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = 
get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL)); +} + +static uint16_t +get_segment_selector(struct kvm_vcpu *vcpu, int seg) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + + return (kvm_seg.selector); +} + +static int +kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return (0); +} + +static int +is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } int -make_all_cpus_request(struct kvm *kvm, unsigned int req) +kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg) { - int i; - cpuset_t set; - processorid_t me, cpu; - struct kvm_vcpu *vcpu; + struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + uint8_t dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + uint32_t err_code = 0; + int null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - CPUSET_ZERO(set); + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return (kvm_load_realmode_segment(vcpu, selector, seg)); - mutex_enter(&kvm->requests_lock); - me = curthread->t_cpu->cpu_id; - for (i = 0; i < kvm->online_vcpus; i++) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - break; - if (test_and_set_bit(req, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != me) - CPUSET_ADD(set, cpu); + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || + seg == VCPU_SREG_TR) && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + + if (ret) + return (ret); + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; } - if (CPUSET_ISNULL(set)) - kvm_xcall(KVM_CPUALL, ack_flush, NULL); - else { - kpreempt_disable(); - xc_sync((xc_arg_t) ack_flush, (xc_arg_t) NULL, - 0, CPUSET2BV(set), (xc_func_t) kvm_xcall_func); - kpreempt_enable(); + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; } - mutex_exit(&kvm->requests_lock); + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.c.b.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return (0); +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); return (1); + } -void -kvm_flush_remote_tlbs(struct kvm *kvm) +static void +save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) - KVM_KSTAT_INC(kvm, kvmks_remote_tlb_flush); + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } -gfn_t -unalias_gfn(struct kvm *kvm, gfn_t gfn) +static void +kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg) { - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} - /* XXX need protection */ - aliases = kvm->arch.aliases; +static int +load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn && - gfn < alias->base_gfn + alias->npages) - return (alias->target_gfn + gfn - alias->base_gfn); + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, 
tss->eax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+	if (kvm_load_segment_descriptor(vcpu,
+	    tss->ldt_selector, VCPU_SREG_LDTR))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+		return (1);
+
+	return (0);
+}
+
+static void
+save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss)
+{
+	tss->ip = kvm_rip_read(vcpu);
+	tss->flag = kvm_get_rflags(vcpu);
+	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int
+load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss)
+{
+	kvm_rip_write(vcpu, tss->ip);
+	kvm_set_rflags(vcpu, tss->flag | 2);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+		return (1);
+
+	return (0);
+}
+
+static int
+kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+    uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc)
+{
+	struct tss_segment_16 tss_segment_16;
+	int ret = 0;
+
+	if (kvm_read_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	save_state_to_tss16(vcpu, &tss_segment_16);
+
+	if (kvm_write_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	if (old_tss_sel != 0xffff) {
+		tss_segment_16.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu,
+		    nseg_desc), &tss_segment_16.prev_task_link,
+		    sizeof (tss_segment_16.prev_task_link)))
+			goto out;
+	}
+
+	if (load_state_from_tss16(vcpu, &tss_segment_16))
+		goto out;
+
+	ret = 1;
+out:
+	return (ret);
+}
+
+static int
+kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+    uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc)
+{
+	struct tss_segment_32 tss_segment_32;
+	int ret = 0;
+
+	if (kvm_read_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	save_state_to_tss32(vcpu, &tss_segment_32);
+
+	if (kvm_write_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	if (old_tss_sel != 0xffff) {
+		tss_segment_32.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu,
+		    nseg_desc), &tss_segment_32.prev_task_link,
+		    sizeof (tss_segment_32.prev_task_link)))
+			goto out;
+	}
+
+	if (load_state_from_tss32(vcpu, &tss_segment_32))
+		goto out;
+
+	ret = 1;
+out:
+	return (ret);
+}
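+
+/*
+ * Hardware task-switch dispatch: kvm_task_switch() below saves the current
+ * CPU state into the outgoing TSS, loads the incoming TSS (using the 32-bit
+ * format when the new descriptor's type field has bit 3 set, the 16-bit
+ * format otherwise), and maintains the busy bit, EFLAGS.NT and the TSS back
+ * link according to the reason for the switch (jmp, call, iret or task
+ * gate). As an illustrative call (not taken from this commit), a task
+ * switch through a TSS selector of, say, 0x28 by a jmp would arrive here
+ * as kvm_task_switch(vcpu, 0x28, TASK_SWITCH_JMP).
+ */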
 
 int
-is_pse(struct kvm_vcpu *vcpu)
+kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason)
 {
-	return (kvm_read_cr4_bits(vcpu, X86_CR4_PSE));
+	struct kvm_segment tr_seg;
+	struct desc_struct cseg_desc;
+	struct desc_struct nseg_desc;
+	int ret = 0;
+	uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+	uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+	uint32_t desc_limit;
+
+	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+
+	/*
+	 * FIXME: Handle errors. Failure to read either TSS or their
+	 * descriptors should generate a pagefault.
+	 */
+	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+		goto out;
+
+	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+		goto out;
+
+	if (reason != TASK_SWITCH_IRET) {
+		int cpl;
+
+		cpl = kvm_x86_ops->get_cpl(vcpu);
+		if ((tss_selector & 3) > nseg_desc.c.b.dpl ||
+		    cpl > nseg_desc.c.b.dpl) {
+			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+			return (1);
+		}
+	}
+
+	desc_limit = get_desc_limit(&nseg_desc);
+
+	if (!nseg_desc.c.b.p || ((desc_limit < 0x67 &&
+	    (nseg_desc.c.b.type & 8)) || desc_limit < 0x2b)) {
+		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+		return (1);
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		cseg_desc.c.b.type &= ~(1 << 1); /* clear the B flag */
+		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET) {
+		uint32_t eflags = kvm_get_rflags(vcpu);
+		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+	}
+
+	/*
+	 * Set the back link to the previous task only if the NT bit is set
+	 * in eflags; note that old_tss_sel is not used after this point.
+	 */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
+	if (nseg_desc.c.b.type & 8) {
+		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+		    old_tss_base, &nseg_desc);
+	} else {
+		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+		    old_tss_base, &nseg_desc);
+	}
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+		uint32_t eflags = kvm_get_rflags(vcpu);
+		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+	}
+
+	if (reason != TASK_SWITCH_IRET) {
+		nseg_desc.c.b.type |= (1 << 1);
+		save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc);
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+	tr_seg.type = 11;
+	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+	return (ret);
+}
+
+static unsigned long
+find_next_bit(const unsigned long *addr,
+    unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + (offset/64);
+	unsigned long result = offset & ~(64-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return (size);
+
+	size -= result;
+	offset %= 64;
+
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < 64)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~(64-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+
+	if (!size)
+		return (result);
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (64 - size));
+	if (tmp == 0UL)			/* Are any bits set? */
+		return (result + size);	/* Nope. */
+found_middle:
+	return (result + __ffs(tmp));
+}
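find_next_bit() returns the index of the first set bit at or beyond offset, or size when no bit is set; note that the 64-bit word size is hard-wired. A minimal caller (hypothetical, for illustration), scanning the way kvm_arch_vcpu_ioctl_set_sregs() does below:

	static int
	lowest_pending_vector(const unsigned long *bitmap, unsigned long nbits)
	{
		unsigned long vec = find_next_bit(bitmap, nbits, 0);

		return (vec < nbits ? (int)vec : -1);	/* -1: nothing pending */
	}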
+
+int
+kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	int mmu_reset_needed = 0;
+	int pending_vec, max_bits;
+	struct descriptor_table dt;
+
+	vcpu_load(vcpu);
+
+	dt.limit = sregs->idt.limit;
+	dt.base = sregs->idt.base;
+	kvm_x86_ops->set_idt(vcpu, &dt);
+	dt.limit = sregs->gdt.limit;
+	dt.base = sregs->gdt.base;
+	kvm_x86_ops->set_gdt(vcpu, &dt);
+
+	vcpu->arch.cr2 = sregs->cr2;
+	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+	vcpu->arch.cr3 = sregs->cr3;
+
+	kvm_set_cr8(vcpu, sregs->cr8);
+
+	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+	kvm_set_apic_base(vcpu, sregs->apic_base);
+
+	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+	vcpu->arch.cr0 = sregs->cr0;
+
+	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+
+	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+		load_pdptrs(vcpu, vcpu->arch.cr3);
+		mmu_reset_needed = 1;
+	}
+
+	if (mmu_reset_needed)
+		kvm_mmu_reset_context(vcpu);
+
+	max_bits = (sizeof (sregs->interrupt_bitmap)) << 3;
+	pending_vec =
+	    find_next_bit((const unsigned long *)sregs->interrupt_bitmap,
+	    max_bits, 0);
+
+	if (pending_vec < max_bits) {
+		kvm_queue_interrupt(vcpu, pending_vec, 0);
+		if (irqchip_in_kernel(vcpu->kvm))
+			kvm_pic_clear_isr_ack(vcpu->kvm);
+	}
+
+	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+	update_cr8_intercept(vcpu);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	/* Older userspace won't unhalt the vcpu on reset. */
+	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+	    !is_protmode(vcpu))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+#endif /* CONFIG_KVM_APIC_ARCHITECTURE */
+
+	vcpu_put(vcpu);
+
+	return (0);
+}
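From userland this handler is reached through the vcpu's KVM_SET_SREGS ioctl, normally as the write half of a read-modify-write against KVM_GET_SREGS. A hedged sketch of that pattern (set_guest_cr4_bit() is hypothetical; the ioctl numbers and struct kvm_sregs are assumed to come from the driver's exported headers):

	#include <sys/ioctl.h>

	static int
	set_guest_cr4_bit(int vcpu_fd, uint64_t bit)
	{
		struct kvm_sregs sregs;

		if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) != 0)
			return (-1);
		sregs.cr4 |= bit;	/* e.g. X86_CR4_PSE */
		return (ioctl(vcpu_fd, KVM_SET_SREGS, &sregs));
	}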
+
+/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ *
+ * 8*16 bytes for each FP-reg = 128 bytes
+ * 16*16 bytes for each XMM-reg = 256 bytes
+ */
+typedef struct fxsave {
+	uint16_t	cwd;
+	uint16_t	swd;
+	uint16_t	twd;
+	uint16_t	fop;
+	uint64_t	rip;
+	uint64_t	rdp;
+	uint32_t	mxcsr;
+	uint32_t	mxcsr_mask;
+	uint32_t	st_space[32];
+#ifdef CONFIG_X86_64
+	uint32_t	xmm_space[64];
+#else
+	uint32_t	xmm_space[32];
+#endif
+} fxsave_t;
+
+int
+kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fpu->fpr, fxsave->st_space, 128);
+	fpu->fcw = fxsave->cwd;
+	fpu->fsw = fxsave->swd;
+	fpu->ftwx = fxsave->twd;
+	fpu->last_opcode = fxsave->fop;
+	fpu->last_ip = fxsave->rip;
+	fpu->last_dp = fxsave->rdp;
+	memcpy(fpu->xmm, fxsave->xmm_space, sizeof (fxsave->xmm_space));
+
+	vcpu_put(vcpu);
+
+	return (0);
+}
+
+int
+kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fxsave->st_space, fpu->fpr, 128);
+	fxsave->cwd = fpu->fcw;
+	fxsave->swd = fpu->fsw;
+	fxsave->twd = fpu->ftwx;
+	fxsave->fop = fpu->last_opcode;
+	fxsave->rip = fpu->last_ip;
+	fxsave->rdp = fpu->last_dp;
+	memcpy(fxsave->xmm_space, fpu->xmm, sizeof (fxsave->xmm_space));
+
+	vcpu_put(vcpu);
+
+	return (0);
 }
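These two handlers back the KVM_GET_FPU/KVM_SET_FPU vcpu ioctls. A hedged userland sketch of the round-trip they enable (clear_x87_exceptions() is hypothetical; the low six bits of the x87 status word are the IE/DE/ZE/OE/UE/PE exception flags):

	#include <sys/ioctl.h>

	static int
	clear_x87_exceptions(int vcpu_fd)
	{
		struct kvm_fpu fpu;

		if (ioctl(vcpu_fd, KVM_GET_FPU, &fpu) != 0)
			return (-1);
		fpu.fsw &= ~0x3f;	/* clear sticky exception flags */
		return (ioctl(vcpu_fd, KVM_SET_FPU, &fpu));
	}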
 
 void
-kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn)
+fx_init(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_is_mmio_pfn(pfn))
-		get_page(pfn_to_page(pfn));
+	unsigned after_mxcsr_mask;
+#ifdef XXX
+	/*
+	 * Touch the fpu the first time in non-atomic context: if this is
+	 * the first fpu instruction, the exception handler will fire
+	 * before the instruction returns and it'll have to allocate ram
+	 * with GFP_KERNEL.
+	 */
+	if (!used_math())
+#else
+	XXX_KVM_PROBE;
+#endif
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+
+	/* Initialize guest FPU by resetting ours and saving into guest's */
+	kpreempt_disable();
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+	kvm_fx_finit();
+	kvm_fx_save(&vcpu->arch.guest_fx_image);
+	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	kpreempt_enable();
+
+	vcpu->arch.cr0 |= X86_CR0_ET;
+	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+	memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image +
+	    after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) -
+	    after_mxcsr_mask);
+}
+
+void
+kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_fpu_loaded)
+		return;
+
+	vcpu->guest_fpu_loaded = 1;
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+	kvm_fx_restore(&vcpu->arch.guest_fx_image);
+	KVM_TRACE1(fpu, int, 1);
+}
+
+void
+kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->guest_fpu_loaded)
+		return;
+
+	vcpu->guest_fpu_loaded = 0;
+	kvm_fx_save(&vcpu->arch.guest_fx_image);
+	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_fpu_reload);
+	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	KVM_TRACE1(fpu, int, 0);
+}
+
+void
+kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.time_page) {
+		/* XXX We aren't doing anything with the time page */
+		XXX_KVM_PROBE;
+		vcpu->arch.time_page = NULL;
+	}
+
+	if (vcpu->kvcpu_kstat != NULL)
+		kstat_delete(vcpu->kvcpu_kstat);
+
+	kvm_x86_ops->vcpu_free(vcpu);
+}
+
+struct kvm_vcpu *
+kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	char buf[32];
+	struct kvm_vcpu *vcpu;
+	kstat_t *kstat;
+
+	(void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid);
+
+	if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED,
+	    sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL)) == NULL) {
+		return (NULL);
+	}
+
+	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+	if (vcpu == NULL) {
+		kstat_delete(kstat);
+		return (NULL);
+	}
+
+	vcpu->kvcpu_kstat = kstat;
+	vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id");
+	vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pid, "pid");
+	vcpu->kvcpu_stats.kvmvs_pid.value.ui64 = kvm->kvm_pid;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail,
+	    "insn-emulation-fail");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls");
+
+	kstat_install(vcpu->kvcpu_kstat);
+
+	return (vcpu);
+}
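Once kstat_install() has run, these per-VCPU counters are visible from userland through the standard kstat tooling (module "kvm", name as constructed by the snprintf() above, instance = the VCPU id). For example, assuming a first VM whose kvmid produced the name vcpu-0:

	# kstat -m kvm -n vcpu-0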
 
 int
@@ -1016,94 +4825,243 @@ free_vcpu:
 	return (r);
 }
 
-void
-kvm_get_kvm(struct kvm *kvm)
+int
+kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	atomic_inc_32(&kvm->users_count);
+	vcpu->arch.nmi_pending = 0;
+	vcpu->arch.nmi_injected = 0;
+
+	vcpu->arch.switch_db_regs = 0;
+	memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db));
+	vcpu->arch.dr6 = DR6_FIXED_1;
+	vcpu->arch.dr7 = DR7_FIXED_1;
+
+	return (kvm_x86_ops->vcpu_reset(vcpu));
 }
 
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
 int
-kvm_vm_ioctl_create_vcpu(struct kvm *kvm, uint32_t id, int *rval_p)
+kvm_arch_hardware_enable(void *garbage)
 {
-	int r, i;
-	struct kvm_vcpu *vcpu, *v;
-
-	vcpu = kvm_arch_vcpu_create(kvm, id);
-	if (vcpu == NULL)
-		return (EINVAL);
-
 #ifdef XXX
-	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+	/*
+	 * Since this may be called from a hotplug notification,
+	 * we can't get the CPU frequency directly.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		int cpu = raw_smp_processor_id();
+		per_cpu(cpu_tsc_khz, cpu) = 0;
+	}
 #else
 	XXX_KVM_PROBE;
 #endif
+	kvm_shared_msr_cpu_online();
 
-	r = kvm_arch_vcpu_setup(vcpu);
-	if (r)
-		return (r);
-
-	mutex_enter(&kvm->lock);
+	return (kvm_x86_ops->hardware_enable(garbage));
+}
 
-#ifdef XXX
-	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-#else
-	XXX_KVM_SYNC_PROBE;
-	if (kvm->online_vcpus == KVM_MAX_VCPUS) {
+void
+kvm_arch_hardware_disable(void *garbage)
+{
+	kvm_x86_ops->hardware_disable(garbage);
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	drop_user_return_notifiers(garbage);
 #endif
-		r = EINVAL;
-		goto vcpu_destroy;
+}
+
+int
+kvm_arch_hardware_setup(void)
+{
+	return (kvm_x86_ops->hardware_setup());
+}
+
+void
+kvm_arch_check_processor_compat(void *rtn)
+{
+	kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+int
+kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	page_t *page;
+	struct kvm *kvm;
+	int r;
+
+	kvm = vcpu->kvm;
+
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	else
+		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+	/*
+	 * page = alloc_page(PAGESIZE, KM_SLEEP);
+	 * if (!page) {
+	 *	r = ENOMEM;
+	 *	goto fail;
+	 * }
+	 * vcpu->arch.pio_data = page_address(page);
+	 */
+	vcpu->arch.pio_data = (caddr_t)vcpu->run +
+	    (KVM_PIO_PAGE_OFFSET * PAGESIZE);
+
+	r = kvm_mmu_create(vcpu);
+	if (r < 0)
+		goto fail;
+
+	if (irqchip_in_kernel(kvm)) {
+		r = kvm_create_lapic(vcpu);
+		if (r < 0)
+			goto fail_mmu_destroy;
 	}
 
-	/* kvm_for_each_vcpu(r, v, kvm) */
-	for (i = 0; i < kvm->online_vcpus; i++) {
-		v = kvm->vcpus[i];
-		if (v->vcpu_id == id) {
-			r = -EEXIST;
-			goto vcpu_destroy;
-		}
+	vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS *
+	    sizeof (uint64_t) * 4, KM_SLEEP);
+
+	if (!vcpu->arch.mce_banks) {
+		r = ENOMEM;
+		goto fail_free_lapic;
+	}
+
+	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+	return (0);
+fail_free_lapic:
+	kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+	kvm_mmu_destroy(vcpu);
+fail:
+	return (r);
+}
+
+void
+kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 *
+	    KVM_MAX_MCE_BANKS);
+	kvm_free_lapic(vcpu);
+	kvm_mmu_destroy(vcpu);
+}
+
+struct kvm *
+kvm_arch_create_vm(void)
+{
+	struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP);
+
+	if (!kvm)
+		return (NULL);
+
+	if ((kvm->arch.aliases =
+	    kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) {
+		kmem_free(kvm, sizeof (struct kvm));
+		return (NULL);
 	}
 
-	/* BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); */
+	list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page),
+	    offsetof(struct kvm_mmu_page, link));
 
-	/* Now it's all set up, let userspace reach it */
-	kvm_get_kvm(kvm);
+	list_create(&kvm->arch.assigned_dev_head,
+	    sizeof (struct kvm_assigned_dev_kernel),
+	    offsetof(struct kvm_assigned_dev_kernel, list));
 
-	*rval_p = kvm->online_vcpus;  /* guarantee unique id */
-	vcpu->vcpu_id = *rval_p;
+	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-	/* XXX need to protect online_vcpus */
-	kvm->vcpus[kvm->online_vcpus] = vcpu;
+	/* XXX - original is rdtscll() */
+	kvm->arch.vm_init_tsc = (uint64_t)gethrtime();
 
-#ifdef XXX
-	smp_wmb();
-#else
+	return (kvm);
+}
+
+static void
+kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_mmu_unload(vcpu);
+	vcpu_put(vcpu);
+}
+
+static void
+kvm_free_vcpus(struct kvm *kvmp)
+{
+	int ii, maxcpus;
+
+	maxcpus = kvmp->online_vcpus;
 	XXX_KVM_SYNC_PROBE;
-#endif
-	atomic_inc_32(&kvm->online_vcpus);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_unload_vcpu_mmu(kvmp->vcpus[ii]);
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	if (kvm->bsp_vcpu_id == id)
-		kvm->bsp_vcpu = vcpu;
-#endif
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_arch_vcpu_free(kvmp->vcpus[ii]);
 
-	mutex_exit(&kvm->lock);
-	return (r);
+	mutex_enter(&kvmp->lock);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvmp->vcpus[ii] = NULL;
+	kvmp->online_vcpus = 0;
+	mutex_exit(&kvmp->lock);
+}
+
+/*
+ * This function exists because of a difference in methodologies from our
+ * ancestor. With our ancestors, there is no impetus to clean up lists and
+ * mutexes. This is unfortunate, because they seem to even have debug kernels
+ * which would seemingly check for these kinds of things. But because in the
+ * common case mutex_exit is currently a #define to do {} while(0), it seems
+ * that they just ignore this.
+ *
+ * This leads to the following behavior: during our time we create a lot of
+ * auxiliary structs potentially related to pits, apics, etc. Tearing down
+ * these structures relies on having the correct locks, etc. However
+ * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. it's
+ * doing the kmem_free. Logically these auxiliary structures need to be freed
+ * and dealt with before we go back and do the rest of the tear down related
+ * to the device.
+ */
+void
+kvm_arch_destroy_vm_comps(struct kvm *kvmp)
+{
+	if (kvmp == NULL)
+		return;
 
-vcpu_destroy:
+#ifdef IOMMU
+	kvm_iommu_unmap_guest(kvmp);
+#else
+	XXX_KVM_PROBE;
+#endif /* IOMMU */
+	kvm_free_pit(kvmp);
+	kvm_free_vcpus(kvmp);
+	kvm_free_physmem(kvmp);
 #ifdef XXX
-	mutex_exit(&kvm->lock);
-	kvm_arch_vcpu_destroy(vcpu);
+#ifdef APIC
+	if (kvm->arch.apic_access_page)
+		put_page(kvm->arch.apic_access_page);
+	if (kvm->arch.ept_identity_pagetable)
+		put_page(kvm->arch.ept_identity_pagetable);
+#endif /* APIC */
 #else
 	XXX_KVM_PROBE;
-#endif
-	return (r);
+#endif /* XXX */
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	cleanup_srcu_struct(&kvm->srcu);
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 }
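The ordering that the comment above argues for, sketched as a hypothetical caller (the wrapper name is illustrative; the real call site lives in the VM teardown path in kvm.c):

	static void
	kvm_teardown(struct kvm *kvmp)
	{
		/* free pit/vcpus/physmem while their locks still exist ... */
		kvm_arch_destroy_vm_comps(kvmp);
		/* ... then deliver the final death blow (see below) */
		kvm_arch_destroy_vm(kvmp);
	}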
 
+void
+kvm_arch_destroy_vm(struct kvm *kvmp)
+{
+	if (kvmp == NULL)
+		return;	/* nothing to do here */
 
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
+	if (kvmp->arch.aliases) {
+		kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases));
+		kvmp->arch.aliases = NULL;
+	}
+	kmem_free(kvmp, sizeof (struct kvm));
+}
+
+int
+kvm_arch_prepare_memory_region(struct kvm *kvm,
 	struct kvm_memory_slot *memslot, struct kvm_memory_slot old,
 	struct kvm_userspace_memory_region *mem, int user_alloc)
 {
@@ -1162,171 +5120,154 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return (0);
 }
 
-/*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
- *
- * Must be called holding mmap_sem for write.
- */
-int
-kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-    struct kvm_userspace_memory_region *mem, int user_alloc)
+void
+kvm_arch_commit_memory_region(struct kvm *kvm,
+    struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old,
+    int user_alloc)
 {
-	if (mem->slot >= KVM_MEMORY_SLOTS)
-		return (EINVAL);
-	return (kvm_set_memory_region(kvm, mem, user_alloc));
-}
+	int npages = mem->memory_size >> PAGESHIFT;
 
+	if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+		int ret = 0;
 
-/* Caller must hold slots_lock. */
-int
-kvm_io_bus_register_dev(struct kvm *kvm,
-    enum kvm_bus bus_idx, struct kvm_io_device *dev)
-{
-	struct kvm_io_bus *new_bus, *bus;
+#ifdef XXX
+		down_write(&current->mm->mmap_sem);
+		ret = munmap(old.userspace_addr,
+		    old.npages * PAGESIZE);
+		up_write(&current->mm->mmap_sem);
+#else
+		XXX_KVM_PROBE;
+		/* see comment in kvm_arch_prepare_memory_region */
+		/*
+		 * XXX this needs to be here, but I'm getting kernel heap
+		 * corruption panics with someone writing to a buffer after it
+		 * is freed
+		 */
+		kmem_free((caddr_t)old.userspace_addr, old.npages * PAGESIZE);
+#endif
+		if (ret < 0) {
+			cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: "
+			    "failed to munmap memory\n");
+		}
+	}
 
-	bus = kvm->buses[bus_idx];
-	if (bus->dev_count > NR_IOBUS_DEVS-1)
-		return (-ENOSPC);
+	mutex_enter(&kvm->mmu_lock);
+	if (!kvm->arch.n_requested_mmu_pages) {
+		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+	}
+
+	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	mutex_exit(&kvm->mmu_lock);
+}
 
-	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
-	if (!new_bus)
-		return (-ENOMEM);
-	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
-	new_bus->devs[new_bus->dev_count++] = dev;
+void
+kvm_arch_flush_shadow(struct kvm *kvm)
+{
+	kvm_mmu_zap_all(kvm);
 #ifdef XXX
-	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-	synchronize_srcu_expedited(&kvm->srcu);
+	kvm_reload_remote_mmus(kvm);
 #else
 	XXX_KVM_PROBE;
-	kvm->buses[bus_idx] = new_bus;
 #endif
-	if (bus)
-		kmem_free(bus, sizeof (struct kvm_io_bus));
-
-	return (0);
 }
 
-/* Caller must hold slots_lock. */
 int
-kvm_io_bus_unregister_dev(struct kvm *kvm,
-    enum kvm_bus bus_idx, struct kvm_io_device *dev)
+kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	int i, r;
-	struct kvm_io_bus *new_bus, *bus;
+	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE ||
+	    vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+	    vcpu->arch.nmi_pending ||
+	    (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)));
+}
 
-	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
-	if (!new_bus)
-		return (-ENOMEM);
+void
+kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	processorid_t cpu = vcpu->cpu;
 
-	bus = kvm->buses[bus_idx];
-	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
+	mutex_enter(&vcpu->kvcpu_kick_lock);
 
-	r = -ENOENT;
-	for (i = 0; i < new_bus->dev_count; i++) {
-		if (new_bus->devs[i] == dev) {
-			r = 0;
-			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
-			break;
-		}
-	}
+	if (CV_HAS_WAITERS(&vcpu->kvcpu_kick_cv))
+		KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_wakeup);
 
-	if (r) {
-		kmem_free(new_bus, sizeof (struct kvm_io_bus));
-		return (r);
-	}
+	cv_broadcast(&vcpu->kvcpu_kick_cv);
+	mutex_exit(&vcpu->kvcpu_kick_lock);
 
-#ifdef XXX
-	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-	synchronize_srcu_expedited(&kvm->srcu);
-#else
-	XXX_KVM_SYNC_PROBE;
-	kvm->buses[bus_idx] = new_bus;
-#endif
-	kmem_free(bus, sizeof (struct kvm_io_bus));
-	return (r);
+	if (cpu != CPU->cpu_id && cpu != -1) {
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) {
+			/*
+			 * If we haven't already kicked this VCPU, we'll poke
+			 * the CPU on which it's running.  (This will serve
+			 * to induce a VM exit.)
+			 */
+			poke_cpu(cpu);
+		}
+	}
 }
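kvm_vcpu_kick() pairs with a halted VCPU sleeping on kvcpu_kick_cv. A hedged sketch of that waiter side, assuming a kvm_vcpu_block()-style loop elsewhere in the driver (the function name and details here are illustrative, not the actual implementation):

	static void
	vcpu_halt_wait(struct kvm_vcpu *vcpu)
	{
		mutex_enter(&vcpu->kvcpu_kick_lock);
		while (!kvm_arch_vcpu_runnable(vcpu)) {
			/* cv_broadcast() in kvm_vcpu_kick() wakes us */
			cv_wait(&vcpu->kvcpu_kick_cv, &vcpu->kvcpu_kick_lock);
		}
		mutex_exit(&vcpu->kvcpu_kick_lock);
	}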
 
-long
-kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
+int
+kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	void *argp = (void *)arg;
-	int r;
-	proc_t *p;
+	return (kvm_x86_ops->interrupt_allowed(vcpu));
+}
 
-	if (kvmp->mm != curproc->p_as)
-		return (EIO);
+unsigned long
+kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
 
-	switch (ioctl) {
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-	case KVM_REGISTER_COALESCED_MMIO: {
-		struct kvm_coalesced_mmio_zone zone;
-		r = EFAULT;
-		if (copyin(argp, &zone, sizeof (zone)))
-			goto out;
-		r = ENXIO;
-		r = kvm_vm_ioctl_register_coalesced_mmio(kvmp, &zone);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_UNREGISTER_COALESCED_MMIO: {
-		struct kvm_coalesced_mmio_zone zone;
-		r = EFAULT;
-		if (copyin(argp, &zone, sizeof (zone)))
-			goto out;
-		r = ENXIO;
-		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvmp, &zone);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-#endif
-#ifdef XXX_KVM_DECLARATION
-	case KVM_IRQFD: {
-		struct kvm_irqfd data;
+	rflags = kvm_x86_ops->get_rflags(vcpu);
 
-		if (ddi_copyin(argp, &data, sizeof (data), mode))
-			return (EFAULT);
-		r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags);
-		break;
-	}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
 
-	case KVM_IOEVENTFD: {
-		struct kvm_ioeventfd data;
+	return (rflags);
+}
 
-		r = -EFAULT;
-		if (copy_from_user(&data, argp, sizeof (data)))
-			goto out;
-		r = kvm_ioeventfd(kvmp, &data);
-		break;
+void
+kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+	    vcpu->arch.singlestep_cs == get_segment_selector(vcpu,
+	    VCPU_SREG_CS) && vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) {
+		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 	}
-#endif
-	default:
-		return (EINVAL);
-	}
+	kvm_x86_ops->set_rflags(vcpu, rflags);
+}
 
-out:
-	return (r);
+inline gpa_t
+gfn_to_gpa(gfn_t gfn)
+{
+	return ((gpa_t)gfn << PAGESHIFT);
 }
 
-int
-kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+/*
+ * For pages for which vmx needs physical addresses,
+ * Linux allocates pages from an area that maps virtual
+ * addresses 1-1 with physical memory. In this way,
+ * translating virtual to physical just involves subtracting
+ * the start of the area from the virtual address.
+ * This Solaris version uses kmem_alloc, so there is no
+ * direct mapping of virtual to physical. We'll change this
+ * later if performance is an issue. For now, we'll use
+ * hat_getpfnum() to do the conversion. Also note that
+ * we're assuming 64-bit address space (we won't run on
+ * 32-bit hardware).
+ */
+uint64_t
+kvm_va2pa(caddr_t va)
 {
-	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE ||
-	    vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
-	    vcpu->arch.nmi_pending ||
-	    (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)));
+	uint64_t pa;
+
+	pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET);
+	return (pa);
 }
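A hedged usage sketch for kvm_va2pa() (the helper name is hypothetical): since the pfn is shifted up and the intra-page offset is OR'd back in, the low bits of the virtual address survive translation, which a caller can sanity-check:

	static void
	check_va2pa(caddr_t va)
	{
		uint64_t pa = kvm_va2pa(va);

		/* the intra-page offset of va is preserved in pa */
		ASSERT((pa & PAGEOFFSET) == ((uintptr_t)va & PAGEOFFSET));
	}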
 
 void
-kvm_reload_remote_mmus(struct kvm *kvm)
+kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
 }
diff --git a/kvm_x86host.h b/kvm_x86host.h
index 96056b6..6549ac4 100644
--- a/kvm_x86host.h
+++ b/kvm_x86host.h
@@ -16,6 +16,10 @@
 #define	offsetof(s, m) ((size_t)(&((s *)0)->m))
 #endif
 
+#define	MCG_CTL_P (1ULL<<8)	/* MCG_CTL register available */
+#define	KVM_MAX_MCE_BANKS 32
+#define	KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
 #define	KVM_MAX_VCPUS 64
 #define	KVM_MEMORY_SLOTS 32
 /* memory slots that are not exposed to userspace */
@@ -741,7 +745,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data);
diff --git a/kvm_x86impl.h b/kvm_x86impl.h
index 4e90237..b7c5726 100644
--- a/kvm_x86impl.h
+++ b/kvm_x86impl.h
@@ -29,7 +29,6 @@ inline int is_paging(struct kvm_vcpu *vcpu);
 caddr_t page_address(page_t *page);
 extern page_t *alloc_page(size_t, int);
 extern uint64_t kvm_va2pa(caddr_t va);
-extern void bitmap_zero(unsigned long *, int);
 extern page_t *pfn_to_page(pfn_t);
 extern int zero_constructor(void *, void *, int);
 
@@ -39,56 +38,6 @@ typedef void (*kvm_xcall_t)(void *);
 extern void kvm_xcall(processorid_t cpu, kvm_xcall_t func, void *arg);
 extern int kvm_xcall_func(kvm_xcall_t func, void *arg);
 
-/*
- * All the following definitions are ones that are expected to just be in
- * x86/x86.c by Linux. However we currently have the things that need them
- * spread out across two files. For now we are putting them here, but this
- * should not last very long.
- */
-#define	KVM_NR_SHARED_MSRS 16
-
-typedef struct kvm_shared_msrs_global {
-	int nr;
-	uint32_t msrs[KVM_NR_SHARED_MSRS];
-} kvm_shared_msrs_global_t;
-
-struct kvm_vcpu;
-
-typedef struct kvm_user_return_notifier {
-	void (*on_user_return)(struct kvm_vcpu *,
-	    struct kvm_user_return_notifier *);
-} kvm_user_return_notifier_t;
-
-typedef struct kvm_shared_msrs {
-	struct kvm_user_return_notifier urn;
-	int registered;
-	struct kvm_shared_msr_values {
-		uint64_t host;
-		uint64_t curr;
-	} values[KVM_NR_SHARED_MSRS];
-} kvm_shared_msrs_t;
-
-/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-typedef struct fxsave {
-	uint16_t	cwd;
-	uint16_t	swd;
-	uint16_t	twd;
-	uint16_t	fop;
-	uint64_t	rip;
-	uint64_t	rdp;
-	uint32_t	mxcsr;
-	uint32_t	mxcsr_mask;
-	uint32_t	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	uint32_t	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	uint32_t	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-} fxsave_t;
-
 unsigned long native_read_cr0(void);
 #define	read_cr0()	(native_read_cr0())
 unsigned long native_read_cr4(void);
@@ -96,11 +45,11 @@ unsigned long native_read_cr4(void);
 unsigned long native_read_cr3(void);
 #define	read_cr3()	(native_read_cr3())
 
-uint32_t bit(int bitno);
-inline unsigned long get_desc_limit(const struct desc_struct *desc);
-unsigned long get_desc_base(const struct desc_struct *desc);
-
 inline page_t *compound_head(page_t *page);
 inline void get_page(page_t *page);
+inline unsigned long get_desc_limit(const struct desc_struct *desc);
+
+extern unsigned long get_desc_base(const struct desc_struct *);
+uint32_t bit(int);
 
 #endif