Diffstat (limited to 'linux/x86/kvm_main.c')
-rw-r--r-- | linux/x86/kvm_main.c | 607
1 file changed, 150 insertions(+), 457 deletions(-)
diff --git a/linux/x86/kvm_main.c b/linux/x86/kvm_main.c
index 355948e..19f1924 100644
--- a/linux/x86/kvm_main.c
+++ b/linux/x86/kvm_main.c
@@ -45,7 +45,6 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -95,12 +94,11 @@
 #include <asm-generic/bitops/le.h>
 
 #include "coalesced_mmio.h"
-#include "async_pf.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
-MODULE_INFO(version, "kvm-kmod-2.6.38-rc7");
+MODULE_INFO(version, "kvm-kmod-2.6.34");
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -131,40 +129,15 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-bool kvm_rebooting;
-EXPORT_SYMBOL_GPL(kvm_rebooting);
+static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-static struct page *fault_page;
-static pfn_t fault_pfn;
-
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		int reserved;
-		struct page *tail = pfn_to_page(pfn);
-		struct page *head = compound_trans_head(tail);
-		reserved = PageReserved(head);
-		if (head != tail) {
-			/*
-			 * "head" is not a dangling pointer
-			 * (compound_trans_head takes care of that)
-			 * but the hugepage may have been splitted
-			 * from under us (and we may not hold a
-			 * reference count on the head page so it can
-			 * be reused before we run PageReferenced), so
-			 * we've to check PageTail before returning
-			 * what we just read.
-			 */
-			smp_rmb();
-			if (PageTail(tail))
-				return reserved;
-		}
-		return PageReserved(tail);
+		struct page *page = compound_head(pfn_to_page(pfn));
+		return PageReserved(page);
 	}
 
 	return true;
@@ -210,7 +183,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	raw_spin_lock(&kvm->requests_lock);
 	me = smp_processor_id();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (kvm_make_check_request(req, vcpu))
+		if (test_and_set_bit(req, &vcpu->requests))
 			continue;
 		cpu = vcpu->cpu;
 		if (cpus != NULL && cpu != -1 && cpu != me)
@@ -229,12 +202,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	int dirty_count = kvm->tlbs_dirty;
-
-	smp_mb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
-	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -252,7 +221,6 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	init_waitqueue_head(&vcpu->wq);
-	kvm_async_pf_vcpu_init(vcpu);
 
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
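Note: the make_all_cpus_request() hunk above open-codes kvm_make_check_request() as test_and_set_bit() on vcpu->requests. Either way the request bit is set atomically and the caller learns whether it was already pending, so vCPUs that have already been signalled are skipped. A minimal userspace model of that semantic, using C11 atomics (all names here are invented for the demo):

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong requests;

/* Returns nonzero if the bit was already set (request already pending). */
static int test_and_set_request(int req)
{
        unsigned long mask = 1UL << req;
        return (atomic_fetch_or(&requests, mask) & mask) != 0;
}

int main(void)
{
        printf("first set:  %d\n", test_and_set_request(3)); /* 0: newly set */
        printf("second set: %d\n", test_and_set_request(3)); /* 1: was pending */
        return 0;
}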
@@ -311,12 +279,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	 * pte after kvm_unmap_hva returned, without noticing the page
 	 * is going to be freed.
 	 */
-	idx = srcu_read_lock(&kvm->srcu);
+	idx = kvm_srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	kvm->mmu_notifier_seq++;
-	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+	need_tlb_flush = kvm_unmap_hva(kvm, address);
 	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_srcu_read_unlock(&kvm->srcu, idx);
 
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
@@ -335,12 +303,12 @@ void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int idx;
 
-	idx = srcu_read_lock(&kvm->srcu);
+	idx = kvm_srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	kvm->mmu_notifier_seq++;
 	kvm_set_spte_hva(kvm, address, pte);
 	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
@@ -351,7 +319,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int need_tlb_flush = 0, idx;
 
-	idx = srcu_read_lock(&kvm->srcu);
+	idx = kvm_srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	/*
 	 * The count increase must become visible at unlock time as no
@@ -361,9 +329,8 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mmu_notifier_count++;
 	for (; start < end; start += PAGE_SIZE)
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
-	need_tlb_flush |= kvm->tlbs_dirty;
 	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_srcu_read_unlock(&kvm->srcu, idx);
 
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
@@ -403,11 +370,11 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int young, idx;
 
-	idx = srcu_read_lock(&kvm->srcu);
+	idx = kvm_srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	young = kvm_age_hva(kvm, address);
 	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_srcu_read_unlock(&kvm->srcu, idx);
 
 	if (young)
 		kvm_flush_remote_tlbs(kvm);
@@ -415,33 +382,15 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
-static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
-				       struct mm_struct *mm,
-				       unsigned long address)
-{
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-	young = kvm_test_age_hva(kvm, address);
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return young;
-}
-#endif
-
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int idx;
 
-	idx = srcu_read_lock(&kvm->srcu);
+	idx = kvm_srcu_read_lock(&kvm->srcu);
 	kvm_arch_flush_shadow(kvm);
-	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
@@ -449,9 +398,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
-	.test_young		= kvm_mmu_notifier_test_young,
-#endif
#ifdef MMU_NOTIFIER_HAS_CHANGE_PTE
 	.change_pte		= kvm_mmu_notifier_change_pte,
 #endif
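Note: every notifier in this block follows the same SRCU read-side bracket; the kvm_srcu_read_lock()/kvm_srcu_read_unlock() wrappers exist only so kvm-kmod builds on kernels whose stock SRCU lacks the needed pieces. For reference, the general shape of the pattern against the upstream API (a sketch, not code from this tree, and not compilable outside the kernel):

#include <linux/srcu.h>

static struct srcu_struct ss;	/* init_srcu_struct(&ss) at setup time */

void reader(void)
{
	int idx = srcu_read_lock(&ss);	/* enter read-side section */
	/* ... dereference SRCU-protected pointers here ... */
	srcu_read_unlock(&ss, idx);	/* pass back the same idx */
}

void writer(void)
{
	/* ... unpublish or replace the protected data ... */
	synchronize_srcu(&ss);	/* wait out all pre-existing readers */
	/* ... now safe to free the old version ... */
}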
@@ -475,15 +421,11 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 static struct kvm *kvm_create_vm(void)
 {
-	int r, i;
-	struct kvm *kvm = kvm_arch_alloc_vm();
-
-	if (!kvm)
-		return ERR_PTR(-ENOMEM);
+	int r = 0, i;
+	struct kvm *kvm = kvm_arch_create_vm();
 
-	r = kvm_arch_init_vm(kvm);
-	if (r)
-		goto out_err_nodisable;
+	if (IS_ERR(kvm))
+		goto out;
 
 	r = hardware_enable_all();
 	if (r)
@@ -497,19 +439,23 @@ static struct kvm *kvm_create_vm(void)
 	r = -ENOMEM;
 	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!kvm->memslots)
-		goto out_err_nosrcu;
-	if (init_srcu_struct(&kvm->srcu))
-		goto out_err_nosrcu;
+		goto out_err;
+	if (kvm_init_srcu_struct(&kvm->srcu))
+		goto out_err;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 					GFP_KERNEL);
-		if (!kvm->buses[i])
+		if (!kvm->buses[i]) {
+			kvm_cleanup_srcu_struct(&kvm->srcu);
 			goto out_err;
+		}
 	}
 
 	r = kvm_init_mmu_notifier(kvm);
-	if (r)
+	if (r) {
+		kvm_cleanup_srcu_struct(&kvm->srcu);
 		goto out_err;
+	}
 
 	kvm->mm = current->mm;
 	mmget(&kvm->mm->mm_count);
@@ -523,35 +469,22 @@ static struct kvm *kvm_create_vm(void)
 	spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
-
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+	kvm_coalesced_mmio_init(kvm);
+#endif
+out:
 	return kvm;
 
 out_err:
-	cleanup_srcu_struct(&kvm->srcu);
-out_err_nosrcu:
 	hardware_disable_all();
 out_err_nodisable:
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm->buses[i]);
 	kfree(kvm->memslots);
-	kvm_arch_free_vm(kvm);
+	kfree(kvm);
 	return ERR_PTR(r);
 }
 
-static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
-{
-	if (!memslot->dirty_bitmap)
-		return;
-
-	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
-		vfree(memslot->dirty_bitmap_head);
-	else
-		kfree(memslot->dirty_bitmap_head);
-
-	memslot->dirty_bitmap = NULL;
-	memslot->dirty_bitmap_head = NULL;
-}
-
 /*
  * Free any memory in @free but not in @dont.
  */
@@ -564,7 +497,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 		vfree(free->rmap);
 
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-		kvm_destroy_dirty_bitmap(free);
+		vfree(free->dirty_bitmap);
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -575,6 +508,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 	}
 
 	free->npages = 0;
+	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
 }
 
@@ -608,9 +542,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_arch_flush_shadow(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
-	kvm_free_physmem(kvm);
-	cleanup_srcu_struct(&kvm->srcu);
-	kvm_arch_free_vm(kvm);
 	hardware_disable_all();
 	mmdrop(mm);
 }
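Note: kvm_destroy_dirty_bitmap(), removed above, pairs with the 2.6.38-side allocator that reserves twice kvm_dirty_bitmap_bytes() for double buffering and matches kfree()/vfree() to the original kzalloc()/vzalloc() choice; the 2.6.34 side goes back to a single vfree'd buffer. The sizing arithmetic, as a standalone demo (the slot size is invented, and ALIGN() is reproduced here only for illustration):

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define ALIGN(x, a)	(((x) + (a) - 1) / (a) * (a))

/* One dirty bit per guest page, rounded up to whole unsigned longs. */
static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
        return ALIGN(npages, BITS_PER_LONG) / 8;
}

int main(void)
{
        unsigned long npages = 262144;	/* a 1 GiB slot of 4 KiB pages */
        unsigned long bytes = dirty_bitmap_bytes(npages);

        printf("bitmap: %lu bytes; double-buffered: %lu bytes\n",
               bytes, 2 * bytes);	/* 32768 and 65536 on LP64 */
        return 0;
}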
@@ -640,27 +571,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 }
 
 /*
- * Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
- */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
-{
-	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
-
-	if (dirty_bytes > PAGE_SIZE)
-		memslot->dirty_bitmap = vzalloc(dirty_bytes);
-	else
-		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
-	if (!memslot->dirty_bitmap)
-		return -ENOMEM;
-
-	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
-	return 0;
-}
-
-/*
  * Allocate some memory and give it an address in the guest physical address
  * space.
  *
@@ -697,16 +607,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 	npages = mem->memory_size >> PAGE_SHIFT;
 
-	r = -EINVAL;
-	if (npages > KVM_MEM_MAX_NR_PAGES)
-		goto out;
-
 	if (!npages)
 		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 
 	new = old = *memslot;
 
-	new.id = mem->slot;
 	new.base_gfn = base_gfn;
 	new.npages = npages;
 	new.flags = mem->flags;
@@ -737,11 +642,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* Allocate if a slot is being created */
 #ifndef CONFIG_S390
 	if (npages && !new.rmap) {
-		new.rmap = vzalloc(npages * sizeof(*new.rmap));
+		new.rmap = vmalloc(npages * sizeof(struct page *));
 
 		if (!new.rmap)
 			goto out_free;
 
+		memset(new.rmap, 0, npages * sizeof(*new.rmap));
+
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 	}
@@ -760,18 +667,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + ((base_gfn + npages - 1)
-			     >> KVM_HPAGE_GFN_SHIFT(level));
-		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+		lpages = 1 + (base_gfn + npages - 1) /
+			     KVM_PAGES_PER_HPAGE(level);
+		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
 
-		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
+		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
 
 		if (!new.lpage_info[i])
 			goto out_free;
 
-		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+		memset(new.lpage_info[i], 0,
+		       lpages * sizeof(*new.lpage_info[i]));
+
+		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
 			new.lpage_info[i][0].write_count = 1;
-		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
 			new.lpage_info[i][lpages - 1].write_count = 1;
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
@@ -789,8 +699,12 @@ skip_lpage:
 
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-		if (kvm_create_dirty_bitmap(&new) < 0)
+		unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
+
+		new.dirty_bitmap = vmalloc(dirty_bytes);
+		if (!new.dirty_bitmap)
 			goto out_free;
+		memset(new.dirty_bitmap, 0, dirty_bytes);
 		/* destroy any largepage mappings for dirty tracking */
 		if (old.npages)
 			flush_shadow = 1;
@@ -809,7 +723,6 @@ skip_lpage:
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		if (mem->slot >= slots->nmemslots)
 			slots->nmemslots = mem->slot + 1;
-		slots->generation++;
 		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
 
 		old_memslots = kvm->memslots;
@@ -830,12 +743,14 @@ skip_lpage:
 	if (r)
 		goto out_free;
 
+#ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	if (npages) {
 		r = kvm_iommu_map_pages(kvm, &new);
 		if (r)
 			goto out_free;
 	}
+#endif
 
 	r = -ENOMEM;
 	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -844,7 +759,6 @@ skip_lpage:
 	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 	if (mem->slot >= slots->nmemslots)
 		slots->nmemslots = mem->slot + 1;
-	slots->generation++;
 
 	/* actual memory is freed via old in kvm_free_physmem_slot below */
 	if (!npages) {
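Note: the lpage_info hunk trades shift/mask arithmetic (KVM_HPAGE_GFN_SHIFT, & (P-1)) for divide/modulo (KVM_PAGES_PER_HPAGE, %); the two forms agree whenever the hugepage span P is a power of two, and lpages counts how many P-sized regions the slot touches. Standalone check with invented sample values:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long P = 512;		/* 2 MiB hugepage / 4 KiB base page */
        unsigned long base_gfn = 1000, npages = 3000;

        /* number of P-sized regions that [base_gfn, base_gfn+npages) touches */
        unsigned long lpages = 1 + (base_gfn + npages - 1) / P - base_gfn / P;

        /* mask/shift and modulo/divide are interchangeable for P = 2^9 */
        assert((base_gfn & (P - 1)) == base_gfn % P);
        assert((base_gfn >> 9) == base_gfn / P);

        /* head/tail regions are partial (write_count = 1) when unaligned */
        printf("lpages=%lu, head partial=%lu, tail partial=%lu\n",
               lpages, base_gfn % P, (base_gfn + npages) % P);	/* 7, 488, 416 */
        return 0;
}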
@@ -942,28 +856,16 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page || page == hwpoison_page || page == fault_page;
+	return page == bad_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
+	return pfn == bad_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
-int is_hwpoison_pfn(pfn_t pfn)
-{
-	return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
-	return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -975,10 +877,10 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
-						gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
+	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
 
 	for (i = 0; i < slots->nmemslots; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -989,18 +891,20 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
+	gfn = unalias_gfn(kvm, gfn);
+	return gfn_to_memslot_unaliased(kvm, gfn);
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
 
+	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 
@@ -1042,9 +946,10 @@ out:
 int memslot_id(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
 	struct kvm_memory_slot *memslot = NULL;
 
+	gfn = unalias_gfn(kvm, gfn);
 	for (i = 0; i < slots->nmemslots; ++i) {
 		memslot = &slots->memslots[i];
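Note: once the slot is known, gfn_to_hva() below is pure arithmetic: hva = userspace_addr + (gfn - base_gfn) * PAGE_SIZE. Standalone demo with an invented slot layout:

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct slot {
        unsigned long base_gfn;		/* first guest frame number */
        unsigned long npages;		/* slot length in pages */
        unsigned long userspace_addr;	/* host virtual base of the slot */
};

static unsigned long gfn_to_hva(const struct slot *s, unsigned long gfn)
{
        if (gfn < s->base_gfn || gfn >= s->base_gfn + s->npages)
                return 0;	/* stand-in for bad_hva() */
        return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
}

int main(void)
{
        struct slot s = { .base_gfn = 0x100, .npages = 256,
                          .userspace_addr = 0x7f0000000000UL };
        printf("gfn 0x110 -> hva %#lx\n", gfn_to_hva(&s, 0x110));
        return 0;	/* 0x10 pages past the base: 0x7f0000010000 */
}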
@@ -1056,179 +961,76 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return memslot - slots->memslots;
 }
 
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
-				     gfn_t *nr_pages)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
+	struct kvm_memory_slot *slot;
+
+	gfn = unalias_gfn_instantiation(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
-
-	if (nr_pages)
-		*nr_pages = slot->npages - (gfn - slot->base_gfn);
-
-	return gfn_to_hva_memslot(slot, gfn);
-}
-
-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t get_fault_pfn(void)
-{
-	get_page(fault_page);
-	return fault_pfn;
-}
-
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-			bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 {
 	struct page *page[1];
-	int npages = 0;
+	int npages;
 	pfn_t pfn;
 
-	/* we can do it either atomically or asynchronously, not both */
-	BUG_ON(atomic && async);
-
-	BUG_ON(!write_fault && !writable);
-
-	if (writable)
-		*writable = true;
+	might_sleep();
 
-	if (atomic || async)
-		npages = kvm___get_user_pages_fast(addr, 1, 1, page);
-
-	if (unlikely(npages != 1) && !atomic) {
-		might_sleep();
-
-		if (writable)
-			*writable = write_fault;
-
-		npages = get_user_pages_fast(addr, 1, write_fault, page);
-
-		/* map read fault as writable if possible */
-		if (unlikely(!write_fault) && npages == 1) {
-			struct page *wpage[1];
-
-			npages = kvm___get_user_pages_fast(addr, 1, 1, wpage);
-			if (npages == 1) {
-				*writable = true;
-				put_page(page[0]);
-				page[0] = wpage[0];
-			}
-			npages = 1;
-		}
-	}
+	npages = get_user_pages_fast(addr, 1, 1, page);
 
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
-		if (atomic)
-			return get_fault_pfn();
-
 		down_read(&current->mm->mmap_sem);
-		if (is_hwpoison_address(addr)) {
+		vma = find_vma(current->mm, addr);
+
+		if (vma == NULL || addr < vma->vm_start ||
+		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
-			get_page(hwpoison_page);
-			return page_to_pfn(hwpoison_page);
+			get_page(bad_page);
+			return page_to_pfn(bad_page);
 		}
 
-		vma = find_vma_intersection(current->mm, addr, addr+1);
-
-		if (vma == NULL)
-			pfn = get_fault_pfn();
-		else if ((vma->vm_flags & VM_PFNMAP)) {
-			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-				vma->vm_pgoff;
-			BUG_ON(!kvm_is_mmio_pfn(pfn));
-		} else {
-			if (async && (vma->vm_flags & VM_WRITE))
-				*async = true;
-			pfn = get_fault_pfn();
-		}
+		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 		up_read(&current->mm->mmap_sem);
+		BUG_ON(!kvm_is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
 	return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
-{
-	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
-
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
-			  bool write_fault, bool *writable)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned long addr;
 
-	if (async)
-		*async = false;
-
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr)) {
 		get_page(bad_page);
 		return page_to_pfn(bad_page);
 	}
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
-	async = NULL;
-#endif
-	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
-}
-
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
-{
-	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable)
-{
-	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
-{
-	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+	return hva_to_pfn(kvm, addr);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
-		      bool *writable)
+static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+	return hva_to_pfn(kvm, addr);
 }
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-			    int nr_pages)
-{
-	unsigned long addr;
-	gfn_t entry;
-
-	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
-	if (kvm_is_error_hva(addr))
-		return -1;
-
-	if (entry < nr_pages)
-		return 0;
-
-	return kvm___get_user_pages_fast(addr, nr_pages, 1, pages);
-}
-EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
-
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	pfn_t pfn;
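Note: in the VM_PFNMAP fallback of hva_to_pfn() above, vm_pgoff holds the page frame number of the mapping's first page, so the target pfn falls straight out of the offset arithmetic. Demo with made-up numbers:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long vm_start = 0x7f0000200000UL;	/* mapping base (hva) */
        unsigned long vm_pgoff = 0xd0000;		/* pfn of the first page */
        unsigned long addr = 0x7f0000203000UL;		/* faulting hva */

        unsigned long pfn = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
        printf("pfn = %#lx\n", pfn);	/* 3 pages in: 0xd0003 */
        return 0;
}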
@@ -1402,51 +1204,9 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 	return 0;
 }
 
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			      gpa_t gpa)
-{
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	int offset = offset_in_page(gpa);
-	gfn_t gfn = gpa >> PAGE_SHIFT;
-
-	ghc->gpa = gpa;
-	ghc->generation = slots->generation;
-	ghc->memslot = __gfn_to_memslot(slots, gfn);
-	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
-	if (!kvm_is_error_hva(ghc->hva))
-		ghc->hva += offset;
-	else
-		return -EFAULT;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
-
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len)
-{
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	int r;
-
-	if (slots->generation != ghc->generation)
-		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
-
-	if (kvm_is_error_hva(ghc->hva))
-		return -EFAULT;
-
-	r = copy_to_user((void *)ghc->hva, data, len);
-	if (r)
-		return -EFAULT;
-	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
-
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
-	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
-				    offset, len);
+	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
 
@@ -1469,24 +1229,24 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			     gfn_t gfn)
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
+	struct kvm_memory_slot *memslot;
+
+	gfn = unalias_gfn(kvm, gfn);
+	memslot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
+		unsigned long *p = memslot->dirty_bitmap +
+				   rel_gfn / BITS_PER_LONG;
+		int offset = rel_gfn % BITS_PER_LONG;
 
-		generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+		/* avoid RMW */
+		if (!generic_test_le_bit(offset, p))
+			generic___set_le_bit(offset, p);
 	}
 }
 
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_memory_slot *memslot;
-
-	memslot = gfn_to_memslot(kvm, gfn);
-	mark_page_dirty_in_slot(kvm, memslot, gfn);
-}
-
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
@@ -1498,7 +1258,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_arch_vcpu_runnable(vcpu)) {
-			kvm_make_request(KVM_REQ_UNHALT, vcpu);
+			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
 		}
 		if (kvm_cpu_has_pending_timer(vcpu))
@@ -1558,7 +1318,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static struct vm_operations_struct kvm_vcpu_vm_ops = {
-	.fault = kvm_vcpu_fault,
+	.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vcpu_fault),
 };
 
 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1580,9 +1340,6 @@ static struct file_operations kvm_vcpu_fops = {
 	.unlocked_ioctl = kvm_vcpu_ioctl,
 	.compat_ioctl   = kvm_vcpu_ioctl,
 	.mmap           = kvm_vcpu_mmap,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
-	.llseek		= noop_llseek,
-#endif
 };
 
 /*
@@ -1590,7 +1347,7 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	return kvm_anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
 }
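Note: the reverted mark_page_dirty() splits the slot-relative gfn into a word index and a bit offset, and tests the bit before setting it to avoid a read-modify-write on a cacheline other vCPUs may be hammering. The index arithmetic, demonstrated with plain bitops (not the kernel's little-endian variants):

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long bitmap[4];	/* covers 256 pages on LP64 */

static void mark_dirty(unsigned long rel_gfn)
{
        unsigned long *p = bitmap + rel_gfn / BITS_PER_LONG;
        unsigned long mask = 1UL << (rel_gfn % BITS_PER_LONG);

        if (!(*p & mask))	/* avoid RMW if the page is already dirty */
                *p |= mask;
}

int main(void)
{
        mark_dirty(70);		/* word 1, bit 6 on LP64 */
        printf("word1 = %#lx\n", bitmap[1]);	/* 0x40 */
        return 0;
}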
@@ -1672,25 +1429,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
 
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
-
-#if defined(CONFIG_S390) || defined(CONFIG_PPC)
-	/*
-	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
-	 * so vcpu_load() would break it.
-	 */
-	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
-		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-#endif
-
-
-	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_RUN:
 		r = -EINVAL;
 		if (arg)
 			goto out;
 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
-		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	case KVM_GET_REGS: {
 		struct kvm_regs *kvm_regs;
@@ -1827,7 +1571,7 @@ out_free2:
 				goto out;
 			p = &sigset;
 		}
-		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 		break;
 	}
 	case KVM_GET_FPU: {
@@ -1862,7 +1606,6 @@ out_free2:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
 out:
-	vcpu_put(vcpu);
 	kfree(fpu);
 	kfree(kvm_sregs);
 	return r;
@@ -1913,6 +1656,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&zone, argp, sizeof zone))
 			goto out;
+		r = -ENXIO;
 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
 		if (r)
 			goto out;
@@ -1924,6 +1668,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&zone, argp, sizeof zone))
 			goto out;
+		r = -ENXIO;
 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
 		if (r)
 			goto out;
@@ -2041,7 +1786,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static struct vm_operations_struct kvm_vm_vm_ops = {
-	.fault = kvm_vm_fault,
+	.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vm_fault),
 };
 
 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
@@ -2057,31 +1802,21 @@ static struct file_operations kvm_vm_fops = {
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.mmap           = kvm_vm_mmap,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
-	.llseek		= noop_llseek,
-#endif
 };
 
 static int kvm_dev_ioctl_create_vm(void)
 {
-	int r;
+	int fd;
 	struct kvm *kvm;
 
 	kvm = kvm_create_vm();
 	if (IS_ERR(kvm))
 		return PTR_ERR(kvm);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-	r = kvm_coalesced_mmio_init(kvm);
-	if (r < 0) {
-		kvm_put_kvm(kvm);
-		return r;
-	}
-#endif
-	r = kvm_anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
-	if (r < 0)
+	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	if (fd < 0)
 		kvm_put_kvm(kvm);
 
-	return r;
+	return fd;
 }
 
 static long kvm_dev_ioctl_check_extension_generic(long arg)
@@ -2153,9 +1888,6 @@ out:
 static struct file_operations kvm_chardev_ops = {
 	.unlocked_ioctl	= kvm_dev_ioctl,
 	.compat_ioctl	= kvm_dev_ioctl,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)
-	.llseek		= noop_llseek,
-#endif
 };
 
 static struct miscdevice kvm_dev = {
@@ -2164,7 +1896,7 @@ static struct miscdevice kvm_dev = {
 	&kvm_chardev_ops,
 };
 
-static void hardware_enable_nolock(void *junk)
+static void hardware_enable(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 	int r;
@@ -2184,14 +1916,7 @@ static void hardware_enable(void *junk)
 	}
 }
 
-static void hardware_enable(void *junk)
-{
-	spin_lock(&kvm_lock);
-	hardware_enable_nolock(junk);
-	spin_unlock(&kvm_lock);
-}
-
-static void hardware_disable_nolock(void *junk)
+static void hardware_disable(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 
@@ -2201,20 +1926,13 @@ static void hardware_disable(void *junk)
 	kvm_arch_hardware_disable(NULL);
 }
 
-static void hardware_disable(void *junk)
-{
-	spin_lock(&kvm_lock);
-	hardware_disable_nolock(junk);
-	spin_unlock(&kvm_lock);
-}
-
 static void hardware_disable_all_nolock(void)
 {
 	BUG_ON(!kvm_usage_count);
 
 	kvm_usage_count--;
 	if (!kvm_usage_count)
-		kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+		kvm_on_each_cpu(hardware_disable, NULL, 1);
 }
 
 static void hardware_disable_all(void)
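Note: both the VM and vCPU descriptors come from the anon-inode machinery: anon_inode_getfd() allocates a fresh fd whose file_operations and private_data are supplied by the caller (kvm_anon_inode_getfd is kvm-kmod's backport wrapper around it). Kernel-side sketch of the pattern; 'mydev' and its ops are invented names:

#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/kernel.h>

struct mydev { int id; };	/* hypothetical private object */

static long mydev_ioctl(struct file *filp, unsigned int ioctl,
			unsigned long arg)
{
	struct mydev *dev = filp->private_data;	/* set by anon_inode_getfd */

	pr_info("mydev %d: ioctl %u\n", dev->id, ioctl);
	return -ENOTTY;	/* no ioctls implemented in this sketch */
}

static struct file_operations mydev_fops = {
	.unlocked_ioctl = mydev_ioctl,
};

/* Returns a new fd referring to 'dev', or a negative errno. */
static int create_mydev_fd(struct mydev *dev)
{
	return anon_inode_getfd("mydev", &mydev_fops, dev, O_RDWR);
}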
@@ -2233,7 +1951,7 @@ static int hardware_enable_all(void)
 	kvm_usage_count++;
 	if (kvm_usage_count == 1) {
 		atomic_set(&hardware_enable_failed, 0);
-		kvm_on_each_cpu(hardware_enable_nolock, NULL, 1);
+		kvm_on_each_cpu(hardware_enable, NULL, 1);
 
 		if (atomic_read(&hardware_enable_failed)) {
 			hardware_disable_all_nolock();
@@ -2261,30 +1979,31 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 		       cpu);
 		hardware_disable(NULL);
 		break;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
-	case CPU_STARTING:
-#else
+	case CPU_UP_CANCELED:
+		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+		       cpu);
+		smp_call_function_single(cpu, hardware_disable, NULL, 1);
+		break;
 	case CPU_ONLINE:
-#endif
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
-		hardware_enable(NULL);
-#else
 		smp_call_function_single(cpu, hardware_enable, NULL, 1);
-#endif
 		break;
 	}
 	return NOTIFY_OK;
 }
 
-asmlinkage void kvm_spurious_fault(void)
+asmlinkage void kvm_handle_fault_on_reboot(void)
 {
+	if (kvm_rebooting)
+		/* spin while reset goes on */
+		while (true)
+			;
 	/* Fault while not rebooting.  We want the trace. */
 	BUG();
 }
-EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
 
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 		      void *v)
@@ -2297,7 +2016,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 	 */
 	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
 	kvm_rebooting = true;
-	kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+	kvm_on_each_cpu(hardware_disable, NULL, 1);
 	return NOTIFY_OK;
 }
 
@@ -2323,9 +2042,7 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
 	int i;
-	struct kvm_io_bus *bus;
-
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
 			return 0;
@@ -2337,9 +2054,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
 	int i;
-	struct kvm_io_bus *bus;
+	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
 
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
 			return 0;
@@ -2403,6 +2119,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
 static struct notifier_block kvm_cpu_notifier = {
 	.notifier_call = kvm_cpu_hotplug,
+	.priority = 20, /* must be > scheduler priority */
 };
 
 static int __vm_stat_get(void *_offset, u64 *val)
@@ -2469,16 +2186,14 @@ static void kvm_exit_debug(void)
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
 	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
+		hardware_disable(NULL);
 	return 0;
 }
 
 static int kvm_resume(struct sys_device *dev)
 {
-	if (kvm_usage_count) {
-		WARN_ON(spin_is_locked(&kvm_lock));
-		hardware_enable_nolock(NULL);
-	}
+	if (kvm_usage_count)
+		hardware_enable(NULL);
 	return 0;
 }
 
@@ -2518,17 +2233,22 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 	kvm_fire_urn();
 }
 
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+int kvm_init(void *opaque, unsigned int vcpu_size,
 	     struct module *module)
 {
 	int r;
 	int cpu;
 
-	r = kvm_init_srcu();
+	r = kvm_init_anon_inodes();
 	if (r)
 		return r;
 
+	r = kvm_init_srcu();
+	if (r)
+		goto cleanup_anon_inodes;
+
 	preempt_notifier_sys_init();
+	hrtimer_kallsyms_resolve();
 
 	r = kvm_arch_init(opaque);
 	if (r)
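Note: hardware_enable_all()/hardware_disable_all() keep a plain usage count under kvm_lock: the first VM turns virtualization on across all CPUs and the last one turns it off. Minimal single-threaded model of the counting (locking omitted):

#include <assert.h>
#include <stdio.h>

static int usage_count;

static void enable_all(void)
{
        if (++usage_count == 1)
                printf("enable virtualization on all CPUs\n"); /* first user */
}

static void disable_all(void)
{
        assert(usage_count > 0);
        if (--usage_count == 0)
                printf("disable virtualization on all CPUs\n"); /* last user */
}

int main(void)
{
        enable_all();	/* VM 1: enables */
        enable_all();	/* VM 2: no-op */
        disable_all();	/* VM 2 exits: no-op */
        disable_all();	/* VM 1 exits: disables */
        return 0;
}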
@@ -2543,24 +2263,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	bad_pfn = page_to_pfn(bad_page);
 
-	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (hwpoison_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	hwpoison_pfn = page_to_pfn(hwpoison_page);
-
-	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (fault_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	fault_pfn = page_to_pfn(fault_page);
-
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2592,19 +2294,14 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_free_4;
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
-	if (!vcpu_align)
-		vcpu_align = __alignof__(struct kvm_vcpu);
-	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+					   __alignof__(struct kvm_vcpu),
 					   0, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
 		goto out_free_5;
 	}
 
-	r = kvm_async_pf_init();
-	if (r)
-		goto out_free;
-
 	kvm_chardev_ops.owner = module;
 IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vm_fops.owner = module;)
 IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
@@ -2612,7 +2309,7 @@ IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
 	r = misc_register(&kvm_dev);
 	if (r) {
 		printk(KERN_ERR "kvm: misc device register failed\n");
-		goto out_unreg;
+		goto out_free;
 	}
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2620,14 +2317,12 @@ IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
 
 	kvm_init_debug();
 
-	printk("loaded kvm module (kvm-kmod-2.6.38-rc7)\n");
+	printk("loaded kvm module (kvm-kmod-2.6.34)\n");
 
 	kvm_clock_warn_suspend_bug();
 
 	return 0;
 
-out_unreg:
-	kvm_async_pf_deinit();
 out_free:
 	kmem_cache_destroy(kvm_vcpu_cache);
 out_free_5:
@@ -2643,37 +2338,35 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
-	if (fault_page)
-		__free_page(fault_page);
-	if (hwpoison_page)
-		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
 out_fail:
 	preempt_notifier_sys_exit();
 	kvm_exit_srcu();
+cleanup_anon_inodes:
+	kvm_exit_anon_inodes();
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
+	tracepoint_synchronize_unregister();
 	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
-	kvm_async_pf_deinit();
 	sysdev_unregister(&kvm_sysdev);
 	sysdev_class_unregister(&kvm_sysdev_class);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
-	kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+	kvm_on_each_cpu(hardware_disable, NULL, 1);
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
-	__free_page(hwpoison_page);
 	__free_page(bad_page);
 	preempt_notifier_sys_exit();
 	kvm_exit_srcu();
+	kvm_exit_anon_inodes();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);