-rw-r--r--  bitops.h       | 473
-rw-r--r--  kvm            | Bin 358152 -> 363368 bytes
-rw-r--r--  kvm.c          |  70
-rw-r--r--  kvm.h          |   6
-rw-r--r--  kvm_host.h     |   2
-rw-r--r--  kvm_x86.c      | 160
-rw-r--r--  kvm_x86host.h  |   2
-rw-r--r--  paging_tmpl.h  |   2

8 files changed, 622 insertions, 93 deletions
diff --git a/bitops.h b/bitops.h
new file mode 100644
index 0000000..dbcdff0
--- /dev/null
+++ b/bitops.h
@@ -0,0 +1,473 @@
+#ifndef _ASM_X86_BITOPS_H
+#define _ASM_X86_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ *
+ * Note: inlines with more than a single statement should be marked
+ * __always_inline to avoid problems with older gcc's inlining heuristics.
+ */
+
+#ifdef XXX
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <linux/compiler.h>
+#include <asm/alternative.h>
+#endif /*XXX*/
+
+#define DIV_ROUND_UP(n,d)	(((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, 8 * sizeof(long))
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+   versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR				BITOP_ADDR(addr)
+
+/*
+ * We do the locked ops that don't return the old value as
+ * a mask operation on a byte.
+ */
+#define IS_IMMEDIATE(nr)		(__builtin_constant_p(nr))
+#define CONST_MASK_ADDR(nr, addr)	BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK(nr)			(1 << ((nr) & 7))
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void
+set_bit(unsigned int nr, volatile unsigned long *addr)
+{
+	if (IS_IMMEDIATE(nr)) {
+		__asm__ volatile("lock orb %1,%0"
+			: CONST_MASK_ADDR(nr, addr)
+			: "iq" ((uint8_t)CONST_MASK(nr))
+			: "memory");
+	} else {
+		__asm__ volatile("lock bts %1,%0"
+			: BITOP_ADDR(addr) : "Ir" (nr) : "memory");
+	}
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+	__asm__ volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered.  However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void
+clear_bit(int nr, volatile unsigned long *addr)
+{
+	if (IS_IMMEDIATE(nr)) {
+		__asm__ volatile("lock andb %1,%0"
+			: CONST_MASK_ADDR(nr, addr)
+			: "iq" ((uint8_t)~CONST_MASK(nr)));
+	} else {
+		__asm__ volatile("lock btr %1,%0"
+			: BITOP_ADDR(addr)
+			: "Ir" (nr));
+	}
+}
+
+/*
+ * clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+	barrier();
+	clear_bit(nr, addr);
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+	__asm__ volatile("btr %1,%0" : ADDR : "Ir" (nr));
+}
+
+/*
+ * __clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit() is non-atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock if no other CPUs can concurrently
+ * modify other bits in the word.
+ *
+ * No memory barrier is required here, because x86 cannot reorder stores past
+ * older loads. Same principle as spin_unlock.
+ */
+static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+	barrier();
+	__clear_bit(nr, addr);
+}
+
+#define smp_mb__before_clear_bit()	barrier()
+#define smp_mb__after_clear_bit()	barrier()
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+	__asm__ volatile("btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+	if (IS_IMMEDIATE(nr)) {
+		__asm__ volatile("lock xorb %1,%0"
+			: CONST_MASK_ADDR(nr, addr)
+			: "iq" ((uint8_t)CONST_MASK(nr)));
+	} else {
+		__asm__ volatile("lock btc %1,%0"
+			: BITOP_ADDR(addr)
+			: "Ir" (nr));
+	}
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("lock bts %2,%1\n\t"
+		"sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on x86.
+ */
+static inline int
+test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+{
+	return test_and_set_bit(nr, addr);
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__("bts %2,%1\n\t"
+	    "sbb %0,%0"
+	    : "=r" (oldbit), ADDR
+	    : "Ir" (nr));
+	return oldbit;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("lock btr %2,%1\n\t"
+		"sbb %0,%0"
+		: "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("btr %2,%1\n\t"
+	    "sbb %0,%0"
+	    : "=r" (oldbit), ADDR
+	    : "Ir" (nr));
+	return oldbit;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("btc %2,%1\n\t"
+		"sbb %0,%0"
+		: "=r" (oldbit), ADDR
+		: "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+
+/**
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("lock btc %2,%1\n\t"
+		"sbb %0,%0"
+		: "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+
+static inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
+{
+	return ((1UL << (nr % 64)) &
+		(((unsigned long *)addr)[nr / 64])) != 0;
+}
+
+static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
+{
+	int oldbit;
+
+	__asm__ volatile("bt %2,%1\n\t"
+		"sbb %0,%0"
+		: "=r" (oldbit)
+		: "m" (*(unsigned long *)addr), "Ir" (nr));
+
+	return oldbit;
+}
+
+#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int test_bit(int nr, const volatile unsigned long *addr);
+#endif
+
+#define test_bit(nr, addr)			\
+	(__builtin_constant_p((nr))		\
+	 ? constant_test_bit((nr), (addr))	\
+	 : variable_test_bit((nr), (addr)))
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+	__asm__("bsf %1,%0"
+		: "=r" (word)
+		: "rm" (word));
+	return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+	__asm__("bsf %1,%0"
+		: "=r" (word)
+		: "r" (~word));
+	return word;
+}
+
+/*
+ * __fls: find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+	__asm__("bsr %1,%0"
+	    : "=r" (word)
+	    : "rm" (word));
+	return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+	int r;
+#ifdef CONFIG_X86_CMOV
+	__asm__("bsfl %1,%0\n\t"
+		"cmovzl %2,%0"
+		: "=r" (r) : "rm" (x), "r" (-1));
+#else
+	__asm__("bsfl %1,%0\n\t"
+		"jnz 1f\n\t"
+		"movl $-1,%0\n"
+		"1:" : "=r" (r) : "rm" (x));
+#endif
+	return r + 1;
+}
+
+/**
+ * fls - find last set bit in word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static inline int fls(int x)
+{
+	int r;
+#ifdef CONFIG_X86_CMOV
+	__asm__("bsrl %1,%0\n\t"
+		"cmovzl %2,%0"
+		: "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+	__asm__("bsrl %1,%0\n\t"
+		"jnz 1f\n\t"
+		"movl $-1,%0\n"
+		"1:" : "=r" (r) : "rm" (x));
+#endif
+	return r + 1;
+}
+#endif /* __KERNEL__ */
+
+#undef ADDR
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/sched.h>
+
+#define ARCH_HAS_FAST_MULTIPLIER 1
+
+#include <asm-generic/bitops/hweight.h>
+
+#endif /* __KERNEL__ */
+
+#ifdef XXX
+#include <asm-generic/bitops/fls64.h>
+#endif /*XXX*/
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/ext2-non-atomic.h>
+
+#define ext2_set_bit_atomic(lock, nr, addr)			\
+	test_and_set_bit((nr), (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr)			\
+	test_and_clear_bit((nr), (unsigned long *)(addr))
+
+#include <asm-generic/bitops/minix.h>
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_BITOPS_H */
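The primitives above are the building blocks for lock-free flag words, such as the vcpu->requests word manipulated later in this patch. A minimal usage sketch follows; the request names and producer/consumer split are illustrative, not from this codebase:

/*
 * Usage sketch for the bitops above -- hypothetical request numbers,
 * not part of this patch.  Note that ffs()/fls() are 1-based: for
 * 0x58 (bits 3, 4 and 6 set), ffs(0x58) == 4 and fls(0x58) == 7.
 */
#include <sys/types.h>		/* uint8_t on illumos */
#include "bitops.h"

#define	REQ_FLUSH	0	/* bit 0 of the request word */
#define	REQ_REPORT	1	/* bit 1 */

static unsigned long requests;	/* word shared between CPUs */

static void
post_flush_request(void)
{
	set_bit(REQ_FLUSH, &requests);	/* atomic: lock bts */
}

static int
consume_flush_request(void)
{
	/* Returns nonzero exactly once per posted request. */
	return (test_and_clear_bit(REQ_FLUSH, &requests));
}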
diff --git a/kvm b/kvm
Binary files differ
diff --git a/kvm.c b/kvm.c
--- a/kvm.c
+++ b/kvm.c
@@ -63,11 +63,7 @@ found:
 	return result + ffz(tmp);
 }
 
-#ifdef XXX
 int largepages_enabled = 1;
-#else
-int largepages_enabled = 0;
-#endif /*XXX*/
 
 extern struct kvm *kvm_arch_create_vm(void);
 extern void kvm_arch_destroy_vm(struct kvm *kvmp);
@@ -2026,9 +2022,7 @@ int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-#ifdef XXX
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
-#endif
 }
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -2333,7 +2327,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
-		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 #ifdef XXX
@@ -3460,9 +3454,6 @@ kvm_create_vm(void)
 		return (NULL);
 	}
 
-	list_create(&kvmp->arch.active_mmu_pages, sizeof (struct kvm_mmu_page),
-	    offsetof(struct kvm_mmu_page, link));
-
 	rw_init(&kvmp->kvm_rwlock, NULL, RW_DRIVER, NULL);
 
 	for (i = 0; i < KVM_NR_BUSES; i++) {
@@ -3949,11 +3940,7 @@ skip_lpage:
 	kvm_arch_commit_memory_region(kvmp, mem, old, user_alloc);
 
 	kvm_free_physmem_slot(&old, &new);
-#ifdef NOTNOW
-	/* XXX this needs to be here, but I'm getting kernel heap corruption */
-	/* panics with someone writing to a buffer after it is freed */
 	kmem_free(old_memslots, sizeof (struct kvm_memslots));
-#endif /*NOTNOW*/
 
 	if (flush_shadow)
 		kvm_arch_flush_shadow(kvmp);
@@ -5586,7 +5573,7 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
-static inline unsigned long bad_hva(void)
+inline unsigned long bad_hva(void)
 {
 	return PAGEOFFSET;
 }
@@ -6089,6 +6076,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 	unsigned long addr;
 
 	addr = gfn_to_hva(kvm, gfn);
+	cmn_err(CE_NOTE, "kvm_write_guest_page: gfn = %lx, hva = %lx\n", gfn, addr);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	/* XXX - addr could be user or kernel */
@@ -6518,17 +6506,19 @@ inline void get_page(caddr_t page)
 {
 }
 
-extern caddr_t pfn_to_page(pfn_t pfn);
+extern caddr_t pfn_to_page(struct kvm *kvm, pfn_t pfn);
 
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+inline int kvm_is_mmio_pfn(struct kvm *kvm, pfn_t pfn)
 {
 #ifdef XXX
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
+		struct page *page = compound_head(pfn_to_page(kvm, pfn));
 		return PageReserved(page);
 	}
-#endif
 	return 1;
+#else
+	return 0;
+#endif /*XXX*/
 }
@@ -6537,8 +6527,8 @@ caddr_t gfn_to_page(struct kvm *kvm, gfn_t gfn)
 
 	pfn = gfn_to_pfn(kvm, gfn);
 
-	if (!kvm_is_mmio_pfn(pfn))
-		return pfn_to_page(pfn);
+	if (!kvm_is_mmio_pfn(kvm, pfn))
+		return pfn_to_page(kvm, pfn);
 
 	get_page(bad_page);
 	return (caddr_t)bad_page;
@@ -7348,6 +7338,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	/* Record the guest's net vcpu time for enforced NMI injections. */
 #ifdef XXX
 	if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)
@@ -8348,6 +8339,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, uint32_t error_code)
 	enum emulation_result er;
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+	cmn_err(CE_NOTE, "kvm_mmu_page_fault: %p(%p, %lx, %x) returned %x\n",
+	    vcpu->arch.mmu.page_fault, vcpu, cr2, error_code, r);
 	if (r < 0)
 		goto out;
@@ -8355,12 +8348,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, uint32_t error_code)
 		r = 1;
 		goto out;
 	}
-
+	cmn_err(CE_CONT, "kvm_mmu_page_fault: topping up memory caches\n");
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
 
 	er = emulate_instruction(vcpu, cr2, error_code, 0);
+	cmn_err(CE_CONT, "kvm_mmu_page_fault: emulate_instruction returned %x\n", er);
+
 	switch (er) {
 	case EMULATE_DONE:
@@ -8379,6 +8374,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, uint32_t error_code)
 		cmn_err(CE_PANIC, "kvm_mmu_page_fault: unknown return from emulate_instruction: %x\n", er);
 	}
 out:
+	cmn_err(CE_NOTE, "kvm_mmu_page_fault: returns %d\n", r);
 	return r;
 }
@@ -10295,6 +10291,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification;
 	gpa_t gpa;
 	int gla_validity;
+	int rval;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -10319,7 +10316,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 #ifdef XXX
 	trace_kvm_page_fault(gpa, exit_qualification);
 #else
-	return kvm_mmu_page_fault(vcpu, gpa & PAGEMASK, 0);
+	rval = kvm_mmu_page_fault(vcpu, gpa & PAGEMASK, 0);
+	cmn_err(CE_NOTE, "handle_ept_violation: returns %d\n", rval);
+	return rval;
 #endif /*XXX*/
 }
@@ -10569,7 +10568,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	uint32_t exit_reason = vmx->exit_reason;
 	uint32_t vectoring_info = vmx->idt_vectoring_info;
+	int rval;
 
+	cmn_err(CE_NOTE, "vmx_handle_exit: exit_reason = %d, vectoring_info = %x\n", exit_reason, vectoring_info);
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
@@ -10583,6 +10584,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= vmcs_read16(VM_INSTRUCTION_ERROR)&0xff;
+		cmn_err(CE_NOTE, "vmx_handle_exit: fail = %x, failure reason = %x\n",
+		    vmx->fail, (unsigned int)vcpu->run->fail_entry.hardware_entry_failure_reason&0xff);
+
 		return 0;
 	}
@@ -10613,9 +10617,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	}
 
 	if (exit_reason < kvm_vmx_max_exit_handlers
-	    && kvm_vmx_exit_handlers[exit_reason])
-		return kvm_vmx_exit_handlers[exit_reason](vcpu);
-	else {
+	    && kvm_vmx_exit_handlers[exit_reason]) {
+		rval = kvm_vmx_exit_handlers[exit_reason](vcpu);
+		cmn_err(CE_NOTE, "vmx_handle_exit: returning %d from kvm_vmx_exit_handlers[%d]\n",
+		    rval, exit_reason);
+		return rval;
+	} else {
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = exit_reason;
 	}
@@ -11201,6 +11208,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 #endif /*XXX*/
 	kvm_lapic_sync_from_vapic(vcpu);
 	r = kvm_x86_ops->handle_exit(vcpu);
+	cmn_err(CE_NOTE, "vcpu_enter_guest: returning %d\n", r);
 out:
 	return r;
 }
@@ -11364,11 +11372,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 #ifdef XXX
 		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 #endif /*XXX*/
-		/*
-		 * XXX - the following should use a bitset_t
-		 * and do bitset_atomic_test_and_del().
-		 * but I am lazy, and will get to it later
-		 */
 		if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) {
 			switch(vcpu->arch.mp_state) {
@@ -11385,8 +11388,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			}
 		}
 
-		if (r <= 0)
+		if (r <= 0) {
+			cmn_err(CE_NOTE, "__vcpu_run: r = %d\n", r);
 			break;
+		}
 
 #ifdef XXX
 		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
@@ -11415,6 +11420,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 #endif /*XXX*/
 	post_kvm_run_save(vcpu);
 	vapic_exit(vcpu);
+	cmn_err(CE_NOTE, "__vcpu_run: returning %d\n", r);
 	return r;
 }
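Most of the kvm.c hunks above are one mechanical change, repeated: a value that used to be returned directly is captured in a local (r or rval) so it can be reported through cmn_err(9F) before the function returns. The shape of that transformation in isolation, with hypothetical handler names:

#include <sys/cmn_err.h>

struct my_vcpu;				/* hypothetical vcpu type */
extern int dispatch_exit(struct my_vcpu *);

/* Before: the result is invisible to anyone reading the console. */
static int
handle_exit_quiet(struct my_vcpu *v)
{
	return (dispatch_exit(v));
}

/* After: capture, log, then return -- the pattern used above. */
static int
handle_exit_traced(struct my_vcpu *v)
{
	int rval;

	rval = dispatch_exit(v);
	cmn_err(CE_NOTE, "handle_exit: returning %d\n", rval);
	return (rval);
}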
diff --git a/kvm.h b/kvm.h
--- a/kvm.h
+++ b/kvm.h
@@ -1147,19 +1147,19 @@ struct kvm_irq_routing_table {
 struct kvm_shadow_walk_iterator {
 	uint64_t addr;
 	hpa_t shadow_addr;
-	int level;
 	uint64_t *sptep;
+	int level;
 	unsigned index;
 };
 
 extern void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
     struct kvm_vcpu *vcpu, uint64_t addr);
-extern int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator);
+extern int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu);
 extern void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator);
 
 #define for_each_shadow_entry(_vcpu, _addr, _walker)		\
 	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
-	     shadow_walk_okay(&(_walker));			\
+	     shadow_walk_okay(&(_walker), _vcpu);		\
 	     shadow_walk_next(&(_walker)))
 
 struct kvm {
diff --git a/kvm_host.h b/kvm_host.h
--- a/kvm_host.h
+++ b/kvm_host.h
@@ -147,7 +147,7 @@ void kvm_release_pfn_dirty(pfn_t);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
 void kvm_set_pfn_accessed(pfn_t pfn);
-void kvm_get_pfn(pfn_t pfn);
+void kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn);
 
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
diff --git a/kvm_x86.c b/kvm_x86.c
--- a/kvm_x86.c
+++ b/kvm_x86.c
@@ -115,11 +115,13 @@ inline gpa_t gfn_to_gpa(gfn_t gfn)
 	return (gpa_t)gfn << PAGESHIFT;
 }
 
+caddr_t pfn_to_page(struct kvm *kvm, pfn_t pfn);
+
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-#ifdef XXX /*XXX probably just free the page */
+#ifdef XXX
 	if (!kvm_is_mmio_pfn(pfn))
-		put_page(pfn_to_page(pfn));
+		put_page(pfn_to_page(kvm, pfn));
 #endif /*XXX*/
 }
@@ -208,13 +210,13 @@ kvm_arch_destroy_vm(struct kvm *kvm)
 #ifdef XXX
 	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
-#endif
 #ifdef APIC
 	if (kvm->arch.apic_access_page)
 		put_page(kvm->arch.apic_access_page);
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
 #endif /*APIC*/
+#endif /*XXX*/
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	cleanup_srcu_struct(&kvm->srcu);
 #endif /*CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER*/
@@ -1454,8 +1456,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
-			kvm->arch.ept_identity_map_addr >> PAGESHIFT);
+	kvm->arch.ept_identity_pagetable = (caddr_t)kvm_userspace_mem.userspace_addr;
 out:
 	mutex_exit(&kvm->slots_lock);
 	return r;
@@ -1466,6 +1467,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
 
+	memset(&kvm_userspace_mem, 0, sizeof(struct kvm_userspace_memory_region));
 	mutex_enter(&kvm->slots_lock);
 	if (kvm->arch.apic_access_page)
 		goto out;
@@ -1477,7 +1479,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+	kvm->arch.apic_access_page = (caddr_t)kvm_userspace_mem.userspace_addr;
 out:
 	mutex_exit(&kvm->slots_lock);
 	return r;
@@ -1504,7 +1506,7 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
 	}
 #endif /*NOTNOW*/
 
-	vmx->guest_msrs = kmem_alloc(PAGESIZE, KM_SLEEP);
+	vmx->guest_msrs = kmem_zalloc(PAGESIZE, KM_SLEEP);
 	if (!vmx->guest_msrs) {
 		return NULL; /* XXX - need cleanup here */
 	}
@@ -1576,7 +1578,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << NM_VECTOR) | (1u << DB_VECTOR);
-#ifdef XXX
+#ifndef XXX
 	if ((vcpu->guest_debug &
 	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1607,10 +1609,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, uint64_t value)
 		return;
 	}
 
-#ifdef XXX
 	if (!kvm_vcpu_is_bsp(apic->vcpu))
 		value &= ~MSR_IA32_APICBASE_BSP;
-#endif /*XXX*/
 
 	vcpu->arch.apic_base = value;
 	if (apic_x2apic_mode(apic)) {
@@ -1868,7 +1868,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	/* Set up identity-mapping pagetable for EPT in real mode */
 	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
 		tmp = (i << 22) + (PT_VALID | PT_WRITABLE | PT_USER |
-			PT_REF | PT_MOD | PT_PAT_4K);
+			PT_REF | PT_MOD | PT_PAGESIZE);
 		r = kvm_write_guest_page(kvm, identity_map_pfn,
 				&tmp, i * sizeof(tmp), sizeof(tmp));
 		if (r < 0)
@@ -2118,7 +2118,7 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	 */
 	if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
 		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-#ifdef XXX
+#ifndef XXX
 		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
 #else
 		vmcs_writel(GUEST_CS_BASE, 0xffff0000);
@@ -2220,10 +2220,9 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.switch_db_regs = 0;
 
 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
-#ifdef XXX
 	vcpu->arch.dr6 = DR6_FIXED_1;
 	vcpu->arch.dr7 = DR7_FIXED_1;
-#endif /*XXX*/
+
 	/*	return kvm_x86_ops->vcpu_reset(vcpu);*/
 	return vmx_vcpu_reset(vcpu);
 }
@@ -2262,8 +2261,6 @@ extern struct kmem_cache *pte_chain_cache;
 extern struct kmem_cache *rmap_desc_cache;
 extern struct kmem_cache *mmu_page_header_cache;
 
-/*XXX the following is called for tdp (two dimensional hardware paging */
-/* we dont support this right now */
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
     int min)
 {
@@ -2515,11 +2512,47 @@ unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	return &slot->lpage_info[level - 2][idx].rmap_pde;
 }
 
-void kvm_set_pfn_accessed(pfn_t pfn)
+extern inline unsigned long bad_hva(void);
+
+/*
+ * XXX The following routine is misnamed.  Given a gfn
+ * or pfn, the routine returns a virtual address that points
+ * to the same page as the pfn.  On linux, you just use
+ * the kernel area mapped 1-to-1 with physical addresses,
+ * or use the user address stored in the memslot array.
+ * Right now on Solaris, all memory is allocated by the
+ * user level (in which case, we can use the memslot array),
+ * or it's allocated by the kernel, in which case we'll walk
+ * the kvm_mmu_page structs looking for a match.
+ * Either way, this routine is expensive (but how often is
+ * it called???).
+ */
+caddr_t pfn_to_page(struct kvm *kvm, pfn_t pfn)
+{
+	unsigned long raddr;
+	struct kvm_mmu_page *sp;
+	/*
+	 * XXX This routine takes a page frame number and
+	 * returns a virtual address referring to the page.
+	 */
+	raddr = gfn_to_hva(kvm, pfn);	/* search memslot array */
+	if (raddr == bad_hva()) {	/* not in memslots... */
+		for (sp = list_head(&kvm->arch.active_mmu_pages); sp;
+		    sp = list_next(&kvm->arch.active_mmu_pages, sp)) {
+			if ((sp->hpa>>PAGESHIFT) == pfn) {
+				raddr = *sp->spt;
+				break;
+			}
+		}
+	}
+	return((caddr_t)raddr);
+}
+
+void kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn)
 {
 #ifdef XXX
 	if (!kvm_is_mmio_pfn(pfn))
-		mark_page_accessed(pfn_to_page(pfn));
+		mark_page_accessed(pfn_to_page(kvm, pfn));
 #endif /*XXX*/
 }
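The pfn_to_page() added above has to reverse-map a host physical page without Linux's 1-to-1 kernel mapping: it first consults the memslot array via gfn_to_hva(), then falls back to a linear scan of kvm->arch.active_mmu_pages. The shadow_walk_okay() hunk below performs the same scan. A sketch of that common lookup, factored into a hypothetical helper (hpa_to_mmu_page is not in the patch):

/*
 * Hypothetical helper: map a page-aligned host physical address back
 * to the kvm_mmu_page that owns it.  O(n) in the number of shadow
 * pages, which is why the comment above calls the lookup expensive.
 */
static struct kvm_mmu_page *
hpa_to_mmu_page(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	for (sp = list_head(&kvm->arch.active_mmu_pages); sp != NULL;
	    sp = list_next(&kvm->arch.active_mmu_pages, sp)) {
		if (sp->hpa == (hpa & PAGEMASK))
			return (sp);	/* sp->spt is its kernel mapping */
	}
	return (NULL);			/* not a shadow page we allocated */
}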
@@ -2552,11 +2585,11 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
 	mmu_free_rmap_desc(desc);
 }
 
-void kvm_set_pfn_dirty(pfn_t pfn)
+void kvm_set_pfn_dirty(struct kvm *kvm, pfn_t pfn)
 {
 #ifdef XXX
 	if (!kvm_is_mmio_pfn(pfn)) {
-		struct page *page = pfn_to_page(pfn);
+		struct page *page = pfn_to_page(kvm, pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
 	}
@@ -2591,9 +2624,9 @@ void rmap_remove(struct kvm *kvm, uint64_t *spte)
 	sp = page_header(kvm_va2pa((caddr_t)spte), kvm);
 	pfn = spte_to_pfn(*spte);
 	if (*spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
+		kvm_set_pfn_accessed(kvm, pfn);
 	if (is_writable_pte(*spte))
-		kvm_set_pfn_dirty(pfn);
+		kvm_set_pfn_dirty(kvm, pfn);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
 	if (!*rmapp) {
 		cmn_err(CE_WARN, "rmap_remove: %p %lx 0->BUG\n", spte, *spte);
@@ -2719,9 +2752,9 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 }
 
-void kvm_release_pfn_dirty(pfn_t pfn)
+void kvm_release_pfn_dirty(struct kvm_vcpu *vcpu, pfn_t pfn)
 {
-	kvm_set_pfn_dirty(pfn);
+	kvm_set_pfn_dirty(vcpu->kvm, pfn);
 	kvm_release_pfn_clean(pfn);
 }
@@ -2868,6 +2901,8 @@ int set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep,
 
 	spte |= (uint64_t)pfn << PAGESHIFT;
 
+	cmn_err(CE_NOTE, "set_spte: spte = %lx\n", spte);
+
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -2901,7 +2936,9 @@ int set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
+	cmn_err(CE_CONT, "set_spte: calling __set_spte with sptep = %p, spte = %lx\n", sptep, spte);
 	__set_spte(sptep, spte);
+	cmn_err(CE_CONT, "set_spte: returning %x\n", ret);
 	return ret;
 }
@@ -2917,8 +2954,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep,
 	int was_rmapped = 0;
 	int was_writable = is_writable_pte(*sptep);
 	int rmap_count;
-
+	cmn_err(CE_NOTE, "mmu_set_spte: vcpu = %p sptep = %p, level = %x, gfn = %lx\n",
+	    vcpu, sptep, level, gfn);
+	cmn_err(CE_CONT, "mmu_set_spte: pfn = %lx, *sptep = %lx\n", pfn, *sptep);
 	if (is_rmap_spte(*sptep)) {
+		cmn_err(CE_CONT, "mmu_set_spte: is_rmap_spte is true\n");
 		/*
 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
 		 * the parent of the now unreachable PTE.
@@ -2929,16 +2969,21 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep,
 			uint64_t pte = *sptep;
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK, vcpu->kvm);
+			cmn_err(CE_CONT, "mmu_set_spte: child = %p, pte %lx, removing parent\n", child, pte);
 			mmu_page_remove_parent_pte(child, sptep);
 		} else if (pfn != spte_to_pfn(*sptep)) {
+			cmn_err(CE_CONT, "mmu_set_spte: removing rmap for pfn = %lx, spte_to_pfn = %lx\n",
+			    pfn, spte_to_pfn(*sptep));
 			rmap_remove(vcpu->kvm, sptep);
 		} else
 			was_rmapped = 1;
 	}
 
+	cmn_err(CE_CONT, "mmu_set_spte: calling set_spte...\n");
 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
 		      dirty, level, gfn, pfn, speculative, 1,
 		      reset_host_protection)) {
+		cmn_err(CE_CONT, "mmu_set_spte: set_spte returned non-null\n");
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -2949,17 +2994,22 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep,
 		++vcpu->kvm->stat.lpages;
 #endif /*XXX*/
 
+	cmn_err(CE_CONT, "mmu_set_spte: calling page_header_update_slot, kvm = %p, sptep = %p, gfn = %lx\n",
+	    vcpu->kvm, sptep, gfn);
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, sptep, gfn);
+		cmn_err(CE_CONT, "mmu_set_spte: added rmap for vcpu = %p, sptep = %p, gfn = %lx, rmap_count = %d\n",
+		    vcpu, sptep, gfn, rmap_count);
 		kvm_release_pfn_clean(pfn);
 #ifdef XXX
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, sptep, gfn);
 #endif /*XXX*/
 	} else {
+		cmn_err(CE_CONT, "mmu_set_spte: releasing pfn = %lx, was_writable = %x\n", pfn, was_writable);
 		if (was_writable)
-			kvm_release_pfn_dirty(pfn);
+			kvm_release_pfn_dirty(vcpu, pfn);
 		else
 			kvm_release_pfn_clean(pfn);
 	}
@@ -3045,6 +3095,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			    iterator.level - 1, 1, ACC_ALL, iterator.sptep);
 			if (!sp) {
+				cmn_err(CE_WARN, "nonpaging_map: ENOMEM\n");
 				kvm_release_pfn_clean(pfn);
 				return -ENOMEM;
 			}
@@ -3088,7 +3139,6 @@ inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 				uint32_t error_code)
 {
-#ifdef XXX
 	pfn_t pfn;
 	int r;
 	int level;
@@ -3106,8 +3156,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
 	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 
+#ifdef XXX
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
+#endif /*XXX*/
 
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	if (is_error_pfn(pfn)) {
@@ -3129,7 +3181,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 out_unlock:
 	mutex_exit(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-#endif /*XXX*/
 	return 0;
 }
@@ -3282,18 +3333,6 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 	}
 }
 
-caddr_t pfn_to_page(pfn_t pfn)
-{
-	/*
-	 * XXX This routine takes a page frame number and
-	 * returns a virtual address referring to the page.
-	 */
-	return (caddr_t)NULL;	/* XXX fix me!!! */
-}
-
-
-
-
 void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
     struct kvm_vcpu *vcpu, uint64_t addr)
 {
@@ -3310,8 +3349,10 @@ void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	}
 }
 
-int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu)
 {
+	struct kvm_mmu_page *sp;
+
 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
 		return 0;
@@ -3320,7 +3361,25 @@ int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 		return 0;
 
 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
-	iterator->sptep = ((uint64_t *)(iterator->shadow_addr)) + iterator->index;
+	cmn_err(CE_NOTE, "iterator->level = %x, iterator->shadow_addr = %lx, iterator->addr = %lx\n",
+	    iterator->level, iterator->shadow_addr, iterator->addr);
+	cmn_err(CE_CONT, "iterator->index = %x\n", iterator->index);
+#ifdef XXX
+	iterator->sptep = ((uint64_t *)__va(iterator->shadow_addr)) + iterator->index;
+#else
+	for (sp = list_head(&vcpu->kvm->arch.active_mmu_pages); sp;
+	    sp = list_next(&vcpu->kvm->arch.active_mmu_pages, sp)) {
+		if (sp->hpa == iterator->shadow_addr) {
+			iterator->sptep = ((uint64_t *)sp->spt) + iterator->index;
+			cmn_err(CE_CONT, "sp = %p, spt = %p, sptep = %p\n", sp, sp->spt, iterator->sptep);
+			break;
+		}
+	}
+	if (!sp) {
+		cmn_err(CE_NOTE, "shadow_addr %lx not in mmu_page_list\n", iterator->shadow_addr);
+		return 0;
+	}
+#endif /*XXX*/
 	return 1;
 }
@@ -3425,12 +3484,10 @@ gfn_t pse36_gfn_delta(uint32_t gpte)
 	return (gpte & PT32_DIR_PSE36_MASK) << shift;
 }
 
-void kvm_get_pfn(pfn_t pfn)
+void kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn)
 {
-#ifdef XXX
 	if (!kvm_is_mmio_pfn(pfn))
-		get_page(pfn_to_page(pfn));
-#endif /*XXX*/
+		get_page(pfn_to_page(vcpu->kvm, pfn));
 }
 
 #define PTTYPE 64
@@ -3654,18 +3711,9 @@ int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.update_pte.pfn = -1; /* bad_pfn */
 
-#ifdef XXX
-	/*
-	 * XXX currently, we won't support 2 dimensional paging.
-	 * So the hardware will not do guest-virtual to guest-physical
-	 * and guest-physical to host physical. So we'll need to
-	 * implement "shadow" paging...
-	 */
-
 	if (tdp_enabled)
 		return init_kvm_tdp_mmu(vcpu);
 	else
-#endif
 		return init_kvm_softmmu(vcpu);
 	return 0;
 }
@@ -3837,6 +3885,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		if (!userspace_addr)
 			return -ENOMEM;
 		memslot->userspace_addr = (unsigned long) userspace_addr;
+		mem->userspace_addr = (unsigned long) userspace_addr;
+
 	}
 #endif /*DO_MMAP_SOLARIS*/
 #endif /*XXX*/
diff --git a/kvm_x86host.h b/kvm_x86host.h
index bbbff31..dab0002 100644
--- a/kvm_x86host.h
+++ b/kvm_x86host.h
@@ -507,7 +507,7 @@ struct kvm_arch {
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
-	list_t active_mmu_pages;
+	list_t active_mmu_pages;	/* list of all kvm_mmu_page */
 	list_t assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
diff --git a/paging_tmpl.h b/paging_tmpl.h
index b8917ce..7c50fcd 100644
--- a/paging_tmpl.h
+++ b/paging_tmpl.h
@@ -303,7 +303,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
 		return;
 #endif
-	kvm_get_pfn(pfn);
+	kvm_get_pfn(vcpu, pfn);
 
 	/*
 	 * we call mmu_set_spte() with reset_host_protection = 1 because that
	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
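Taken together, the kvm.h and kvm_x86.c changes alter the shadow-walk contract: every user of for_each_shadow_entry() now implicitly passes the vcpu through to shadow_walk_okay() so it can scan vcpu->kvm->arch.active_mmu_pages. A minimal caller sketch under the new contract (find_spte is illustrative and not part of the patch; real callers hold the mmu lock):

/*
 * Illustrative walker using the updated for_each_shadow_entry():
 * find the shadow PTE slot for addr at a given paging level.
 * Assumes vcpu->kvm->mmu_lock is held, as in the real callers.
 */
static uint64_t *
find_spte(struct kvm_vcpu *vcpu, uint64_t addr, int want_level)
{
	struct kvm_shadow_walk_iterator it;

	for_each_shadow_entry(vcpu, addr, it) {
		if (it.level == want_level)
			return (it.sptep);
	}
	return (NULL);		/* walk ended before reaching want_level */
}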