Diffstat (limited to 'usr/src/uts/i86pc/vm/hat_i86.c')
-rw-r--r-- | usr/src/uts/i86pc/vm/hat_i86.c | 519 |
1 files changed, 351 insertions, 168 deletions
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index 53e42e74e4..790007b79b 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -72,6 +72,9 @@
 #include <vm/seg_kp.h>
 #include <vm/seg_kpm.h>
 #include <vm/vm_dep.h>
+#ifdef __xpv
+#include <sys/hypervisor.h>
+#endif
 #include <vm/kboot_mmu.h>
 #include <vm/seg_spt.h>
@@ -85,29 +88,15 @@ struct hat_mmu_info mmu;
 /*
  * The page that is the kernel's top level pagetable.
  *
- * For 32 bit VLP support, the kernel hat will use the 1st 4 entries
+ * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
  * on this 4K page for its top level page table. The remaining groups of
  * 4 entries are used for per processor copies of user VLP pagetables for
  * running threads. See hat_switch() and reload_pae32() for details.
  *
- * vlp_page[0] - 0th level==2 PTE for kernel HAT (will be zero)
- * vlp_page[1] - 1st level==2 PTE for kernel HAT (will be zero)
- * vlp_page[2] - 2nd level==2 PTE for kernel HAT (zero for small memory)
- * vlp_page[3] - 3rd level==2 PTE for kernel
- *
- * vlp_page[4] - 0th level==2 PTE for user thread on cpu 0
- * vlp_page[5] - 1st level==2 PTE for user thread on cpu 0
- * vlp_page[6] - 2nd level==2 PTE for user thread on cpu 0
- * vlp_page[7] - probably copy of kernel PTE
- *
- * vlp_page[8] - 0th level==2 PTE for user thread on cpu 1
- * vlp_page[9] - 1st level==2 PTE for user thread on cpu 1
- * vlp_page[10] - 2nd level==2 PTE for user thread on cpu 1
- * vlp_page[11] - probably copy of kernel PTE
- * ...
- *
- * when / where the kernel PTE's are (entry 2 or 3 or none) depends
- * on kernelbase.
+ * vlp_page[0..3] - level==2 PTEs for kernel HAT
+ * vlp_page[4..7] - level==2 PTEs for user thread on cpu 0
+ * vlp_page[8..11] - level==2 PTEs for user thread on cpu 1
+ * etc...
  */
 static x86pte_t *vlp_page;
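A reader aid for the per-CPU layout described in the rewritten comment: a minimal, compilable user-space model. The helper name vlp_slot_for_cpu and everything else below are illustrative, not part of the patch or of hat_i86.c.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t x86pte_t;

/* entries 0..3 are the kernel's; CPU n owns the group of 4 at (n + 1) * 4 */
static x86pte_t *
vlp_slot_for_cpu(x86pte_t *vlp, int cpu_id)
{
        return (&vlp[(cpu_id + 1) * 4]);
}

int
main(void)
{
        x86pte_t page[512] = { 0 };
        int cpu;

        for (cpu = 0; cpu < 3; ++cpu)
                printf("cpu %d PTE copies at vlp_page[%td..%td]\n", cpu,
                    vlp_slot_for_cpu(page, cpu) - page,
                    vlp_slot_for_cpu(page, cpu) - page + 3);
        return (0);
}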
@@ -119,27 +108,24 @@ static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
 /*
  * The kernel address space exists in all HATs. To implement this the
- * kernel reserves a fixed number of entries in every topmost level page
- * table. The values are setup in hat_init() and then copied to every hat
- * created by hat_alloc(). This means that kernelbase must be:
+ * kernel reserves a fixed number of entries in the topmost level(s) of page
+ * tables. The values are setup during startup and then copied to every user
+ * hat created by hat_alloc(). This means that kernelbase must be:
  *
  *	  4Meg aligned for 32 bit kernels
  *	512Gig aligned for x86_64 64 bit kernel
  *
- * The PAE 32 bit hat is handled as a special case. Otherwise requiring 1Gig
- * alignment would use too much VA for the kernel.
- *
+ * The hat_kernel_range_ts describe what needs to be copied from kernel hat
+ * to each user hat.
  */
-static uint_t	khat_start;	/* index of 1st entry in kernel's top ptable */
-static uint_t	khat_entries;	/* number of entries in kernel's top ptable */
-
-#if defined(__i386)
-
-static htable_t	*khat_pae32_htable = NULL;
-static uint_t	khat_pae32_start;
-static uint_t	khat_pae32_entries;
-
-#endif
+typedef struct hat_kernel_range {
+	level_t		hkr_level;
+	uintptr_t	hkr_start_va;
+	uintptr_t	hkr_end_va;	/* zero means to end of memory */
+} hat_kernel_range_t;
+#define	NUM_KERNEL_RANGE 2
+static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
+static int num_kernel_ranges;
 
 uint_t use_boot_reserve = 1;	/* cleared after early boot process */
 uint_t can_steal_post_boot = 0;	/* set late in boot to enable stealing */
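A sketch of walking one hat_kernel_range_t, assuming 4K pages and 512-entry tables. An hkr_end_va of zero means "to end of memory"; the unsigned wrap-around of the VA past the top of the address space then terminates the loop naturally. All types and the addresses below are simplified stand-ins.

#include <stdint.h>
#include <stdio.h>

typedef struct hat_kernel_range {
        int             hkr_level;
        uint64_t        hkr_start_va;
        uint64_t        hkr_end_va;     /* zero means to end of memory */
} hat_kernel_range_t;

#define LEVEL_SHIFT(l)  (12 + 9 * (l))
#define LEVEL_SIZE(l)   ((uint64_t)1 << LEVEL_SHIFT(l))

int
main(void)
{
        /* one level-3 range from a kernelbase-like VA to end of memory */
        hat_kernel_range_t r = { 3, 0xffff800000000000ULL, 0 };
        uint64_t va;
        int n = 0;

        for (va = r.hkr_start_va; va != r.hkr_end_va;
            va += LEVEL_SIZE(r.hkr_level))
                ++n;
        printf("%d level-%d slots to copy\n", n, r.hkr_level);  /* 256 */
        return (0);
}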
@@ -214,9 +200,16 @@ hati_constructor(void *buf, void *handle, int kmflags)
 hat_t *
 hat_alloc(struct as *as)
 {
-	hat_t		*hat;
-	htable_t	*ht;	/* top level htable */
-	uint_t		use_vlp;
+	hat_t			*hat;
+	htable_t		*ht;	/* top level htable */
+	uint_t			use_vlp;
+	uint_t			r;
+	hat_kernel_range_t	*rp;
+	uintptr_t		va;
+	uintptr_t		eva;
+	uint_t			start;
+	uint_t			cnt;
+	htable_t		*src;
 
 	/*
 	 * Once we start creating user process HATs we can enable
@@ -231,14 +224,21 @@ hat_alloc(struct as *as)
 	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 	ASSERT(hat->hat_flags == 0);
 
+#if defined(__xpv)
 	/*
-	 * a 32 bit process uses a VLP style hat when using PAE
+	 * No VLP stuff on the hypervisor due to the 64-bit split top level
+	 * page tables. On 32-bit it's not needed as the hypervisor takes
+	 * care of copying the top level PTEs to a below 4Gig page.
	 */
+	use_vlp = 0;
+#else	/* __xpv */
+	/* 32 bit processes use a VLP style hat when running with PAE */
 #if defined(__amd64)
 	use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
 #elif defined(__i386)
 	use_vlp = mmu.pae_hat;
 #endif
+#endif	/* __xpv */
 	if (use_vlp) {
 		hat->hat_flags = HAT_VLP;
 		bzero(hat->hat_vlp_ptes, VLP_SIZE);
@@ -258,40 +258,65 @@ hat_alloc(struct as *as)
 	/*
 	 * Initialize Kernel HAT entries at the top of the top level page
-	 * table for the new hat.
-	 *
-	 * Note that we don't call htable_release() for the top level, that
-	 * happens when the hat is destroyed in hat_free_end()
+	 * tables for the new hat.
 	 */
 	hat->hat_htable = NULL;
 	hat->hat_ht_cached = NULL;
+	XPV_DISALLOW_MIGRATE();
 	ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
+	hat->hat_htable = ht;
 
-	if (!(hat->hat_flags & HAT_VLP))
-		x86pte_copy(kas.a_hat->hat_htable, ht, khat_start,
-		    khat_entries);
-#if defined(__i386)
-	else if (khat_entries > 0)
-		bcopy(vlp_page + khat_start, hat->hat_vlp_ptes + khat_start,
-		    khat_entries * sizeof (x86pte_t));
+#if defined(__amd64)
+	if (hat->hat_flags & HAT_VLP)
+		goto init_done;
 #endif
 
-	hat->hat_htable = ht;
-
-#if defined(__i386)
+	for (r = 0; r < num_kernel_ranges; ++r) {
+		rp = &kernel_ranges[r];
+		for (va = rp->hkr_start_va; va != rp->hkr_end_va;
+		    va += cnt * LEVEL_SIZE(rp->hkr_level)) {
+
+			if (rp->hkr_level == TOP_LEVEL(hat))
+				ht = hat->hat_htable;
+			else
+				ht = htable_create(hat, va, rp->hkr_level,
+				    NULL);
+
+			start = htable_va2entry(va, ht);
+			cnt = HTABLE_NUM_PTES(ht) - start;
+			eva = va +
+			    ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
+			if (rp->hkr_end_va != 0 &&
+			    (eva > rp->hkr_end_va || eva == 0))
+				cnt = htable_va2entry(rp->hkr_end_va, ht) -
+				    start;
+
+#if defined(__i386) && !defined(__xpv)
+			if (ht->ht_flags & HTABLE_VLP) {
+				bcopy(&vlp_page[start],
+				    &hat->hat_vlp_ptes[start],
+				    cnt * sizeof (x86pte_t));
+				continue;
+			}
+#endif
+			src = htable_lookup(kas.a_hat, va, rp->hkr_level);
+			ASSERT(src != NULL);
+			x86pte_copy(src, ht, start, cnt);
+			htable_release(src);
+		}
+	}
+
+init_done:
+	XPV_ALLOW_MIGRATE();
+
+#if defined(__xpv)
 	/*
-	 * PAE32 HAT alignment is less restrictive than the others to keep
-	 * the kernel from using too much VA. Because of this we may need
-	 * one layer further down when kernelbase isn't 1Gig aligned.
-	 * See hat_free_end() for the htable_release() that goes with this
-	 * htable_create()
+	 * Pin top level page tables after initializing them
 	 */
-	if (khat_pae32_htable != NULL) {
-		ht = htable_create(hat, kernelbase,
-		    khat_pae32_htable->ht_level, NULL);
-		x86pte_copy(khat_pae32_htable, ht, khat_pae32_start,
-		    khat_pae32_entries);
-		ht->ht_valid_cnt = khat_pae32_entries;
-	}
+	xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
+#if defined(__amd64)
+	xen_pin(hat->hat_user_ptable, mmu.max_level);
+#endif
 #endif
 
 	/*
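The entry-count clamping in the hat_alloc() copy loop above can be checked in isolation. A model under the usual x86 constants (4K pages, 512-entry tables); VA2ENTRY is a simplified stand-in for htable_va2entry():

#include <stdint.h>
#include <stdio.h>

#define LEVEL_SHIFT(l)          (12 + 9 * (l))
#define HTABLE_NUM_PTES         512
#define VA2ENTRY(va, l)         ((unsigned)(((va) >> LEVEL_SHIFT(l)) & 511))

int
main(void)
{
        int level = 3;
        uint64_t va = 0xffff800000000000ULL;    /* stand-in kernelbase */
        uint64_t end_va = 0;                    /* zero: to end of memory */
        unsigned start = VA2ENTRY(va, level);
        unsigned cnt = HTABLE_NUM_PTES - start;
        uint64_t eva = va + ((uint64_t)cnt << LEVEL_SHIFT(level));

        /* clamp cnt when the range ends inside this pagetable */
        if (end_va != 0 && (eva > end_va || eva == 0))
                cnt = VA2ENTRY(end_va, level) - start;
        printf("start=%u cnt=%u eva=%#llx\n", start, cnt,
            (unsigned long long)eva);   /* start=256 cnt=256, eva wraps to 0 */
        return (0);
}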
@@ -346,13 +371,8 @@ hat_free_start(hat_t *hat)
 void
 hat_free_end(hat_t *hat)
 {
-	int i;
 	kmem_cache_t *cache;
 
-#ifdef DEBUG
-	for (i = 0; i <= mmu.max_page_level; i++)
-		ASSERT(hat->hat_pages_mapped[i] == 0);
-#endif
 	ASSERT(hat->hat_flags & HAT_FREEING);
 
 	/*
@@ -375,6 +395,16 @@ hat_free_end(hat_t *hat)
 	mutex_exit(&hat_list_lock);
 	hat->hat_next = hat->hat_prev = NULL;
 
+#if defined(__xpv)
+	/*
+	 * On the hypervisor, unpin top level page table(s)
+	 */
+	xen_unpin(hat->hat_htable->ht_pfn);
+#if defined(__amd64)
+	xen_unpin(hat->hat_user_ptable);
+#endif
+#endif
+
 	/*
 	 * Make a pass through the htables freeing them all up.
 	 */
@@ -535,6 +565,9 @@ mmu_init(void)
 
 	for (i = 0; i <= mmu.max_page_level; ++i) {
 		mmu.pte_bits[i] = PT_VALID;
+#if defined(__xpv) && defined(__amd64)
+		mmu.pte_bits[i] |= PT_USER;
+#endif
 		if (i > 0)
 			mmu.pte_bits[i] |= PT_PAGESIZE;
 	}
@@ -674,7 +707,7 @@ hat_init()
 static void
 hat_vlp_setup(struct cpu *cpu)
 {
-#if defined(__amd64)
+#if defined(__amd64) && !defined(__xpv)
 	struct hat_cpu_info *hci = cpu->cpu_hat_info;
 	pfn_t pfn;
@@ -693,20 +726,19 @@ hat_vlp_setup(struct cpu *cpu)
 	hci->hci_vlp_pfn =
 	    hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes);
 	ASSERT(hci->hci_vlp_pfn != PFN_INVALID);
-	bcopy(vlp_page + khat_start, hci->hci_vlp_l3ptes + khat_start,
-	    khat_entries * sizeof (x86pte_t));
+	bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE);
 
 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes);
 	ASSERT(pfn != PFN_INVALID);
 	hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2);
-#endif /* __amd64 */
+#endif /* __amd64 && !__xpv */
 }
 
 /*ARGSUSED*/
 static void
 hat_vlp_teardown(cpu_t *cpu)
 {
-#if defined(__amd64)
+#if defined(__amd64) && !defined(__xpv)
 	struct hat_cpu_info *hci;
 
 	if ((hci = cpu->cpu_hat_info) == NULL)
@@ -715,7 +747,14 @@ hat_vlp_teardown(cpu_t *cpu)
 		kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE);
 	if (hci->hci_vlp_l3ptes)
 		kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE);
-#endif /* __amd64 */
+#endif
+}
+
+#define	NEXT_HKR(r, l, s, e) {			\
+	kernel_ranges[r].hkr_level = l;		\
+	kernel_ranges[r].hkr_start_va = s;	\
+	kernel_ranges[r].hkr_end_va = e;	\
+	++r;					\
 }
 
 /*
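The NEXT_HKR() macro above just fills successive kernel_ranges[] slots and advances the index, as hat_init_finish() does in the next hunk. A compilable model using the same macro shape; the two addresses are placeholders, not the real kernelbase or HYPERVISOR_VIRT_START/END values:

#include <stdint.h>
#include <stdio.h>

typedef struct {
        int             hkr_level;
        uint64_t        hkr_start_va;
        uint64_t        hkr_end_va;
} hkr_t;

static hkr_t kernel_ranges[2];

#define NEXT_HKR(r, l, s, e) {                  \
        kernel_ranges[r].hkr_level = l;         \
        kernel_ranges[r].hkr_start_va = s;      \
        kernel_ranges[r].hkr_end_va = e;        \
        ++r;                                    \
}

int
main(void)
{
        int r = 0;

        /* placeholder VAs standing in for kernelbase and the Xen hole */
        NEXT_HKR(r, 3, 0xffff800000000000ULL, 0);
        NEXT_HKR(r, 3, 0xffff808000000000ULL, 0xffff810000000000ULL);
        printf("num_kernel_ranges = %d\n", r);  /* 2 */
        return (0);
}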
@@ -729,90 +768,91 @@ hat_vlp_teardown(cpu_t *cpu)
 void
 hat_init_finish(void)
 {
-	htable_t	*top = kas.a_hat->hat_htable;
-	htable_t	*ht;
-	uint_t		e;
-	x86pte_t	pte;
-	uintptr_t	va = kernelbase;
 	size_t		size;
+	uint_t		r = 0;
+	uintptr_t	va;
+	hat_kernel_range_t *rp;
 
-#if defined(__i386)
-	ASSERT((va & LEVEL_MASK(1)) == va);
-
 	/*
-	 * Deal with kernelbase not 1Gig aligned for 32 bit PAE hats.
+	 * We are now effectively running on the kernel hat.
+	 * Clearing use_boot_reserve shuts off using the pre-allocated boot
+	 * reserve for all HAT allocations. From here on, the reserves are
+	 * only used when avoiding recursion in kmem_alloc().
 	 */
-	if (!mmu.pae_hat || (va & LEVEL_OFFSET(mmu.max_level)) == 0) {
-		khat_pae32_htable = NULL;
-	} else {
-		ASSERT(mmu.max_level == 2);
-		ASSERT((va & LEVEL_OFFSET(mmu.max_level - 1)) == 0);
-		khat_pae32_htable =
-		    htable_create(kas.a_hat, va, mmu.max_level - 1, NULL);
-		khat_pae32_start = htable_va2entry(va, khat_pae32_htable);
-		khat_pae32_entries = mmu.ptes_per_table - khat_pae32_start;
-		for (e = khat_pae32_start; e < mmu.ptes_per_table;
-		    ++e, va += LEVEL_SIZE(mmu.max_level - 1)) {
-			pte = x86pte_get(khat_pae32_htable, e);
-			if (PTE_ISVALID(pte))
-				continue;
-			ht = htable_create(kas.a_hat, va, mmu.max_level - 2,
-			    NULL);
-			ASSERT(ht != NULL);
-		}
-	}
-#endif
+	use_boot_reserve = 0;
+	htable_adjust_reserve();
 
 	/*
-	 * The kernel hat will need fixed values in the highest level
-	 * ptable for copying to all other hat's. This implies
-	 * alignment restrictions on _userlimit.
-	 *
-	 * Note we don't htable_release() these htables. This keeps them
-	 * from ever being stolen or free'd.
-	 *
-	 * top_level_count is used instead of ptes_per_table, since
-	 * on 32-bit PAE we only have 4 usable entries at the top level ptable.
+	 * User HATs are initialized with copies of all kernel mappings in
+	 * higher level page tables. Ensure that those entries exist.
 	 */
-	if (va == 0)
-		khat_start = mmu.top_level_count;
-	else
-		khat_start = htable_va2entry(va, kas.a_hat->hat_htable);
-	khat_entries = mmu.top_level_count - khat_start;
-	for (e = khat_start; e < mmu.top_level_count;
-	    ++e, va += LEVEL_SIZE(mmu.max_level)) {
-		if (IN_HYPERVISOR_VA(va))
-			continue;
-		pte = x86pte_get(top, e);
-		if (PTE_ISVALID(pte))
-			continue;
-		ht = htable_create(kas.a_hat, va, mmu.max_level - 1, NULL);
-		ASSERT(ht != NULL);
-	}
+#if defined(__amd64)
+
+	NEXT_HKR(r, 3, kernelbase, 0);
+#if defined(__xpv)
+	NEXT_HKR(r, 3, HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END);
+#endif
+
+#elif defined(__i386)
+
+#if !defined(__xpv)
+	if (mmu.pae_hat) {
+		va = kernelbase;
+		if ((va & LEVEL_MASK(2)) != va) {
+			va = P2ROUNDUP(va, LEVEL_SIZE(2));
+			NEXT_HKR(r, 1, kernelbase, va);
+		}
+		if (va != 0)
+			NEXT_HKR(r, 2, va, 0);
+	} else
+#endif	/* __xpv */
+		NEXT_HKR(r, 1, kernelbase, 0);
+
+#endif	/* __i386 */
+
+	num_kernel_ranges = r;
 
 	/*
-	 * We are now effectively running on the kernel hat.
-	 * Clearing use_boot_reserve shuts off using the pre-allocated boot
-	 * reserve for all HAT allocations. From here on, the reserves are
-	 * only used when mapping in memory for the hat's own allocations.
+	 * Create all the kernel pagetables that will have entries
+	 * shared to user HATs.
 	 */
-	use_boot_reserve = 0;
-	htable_adjust_reserve();
+	for (r = 0; r < num_kernel_ranges; ++r) {
+		rp = &kernel_ranges[r];
+		for (va = rp->hkr_start_va; va != rp->hkr_end_va;
+		    va += LEVEL_SIZE(rp->hkr_level)) {
+			htable_t *ht;
+
+			if (IN_HYPERVISOR_VA(va))
+				continue;
+
+			/* can/must skip if a page mapping already exists */
+			if (rp->hkr_level <= mmu.max_page_level &&
+			    (ht = htable_getpage(kas.a_hat, va, NULL)) !=
+			    NULL) {
+				htable_release(ht);
+				continue;
+			}
+
+			(void) htable_create(kas.a_hat, va, rp->hkr_level - 1,
+			    NULL);
+		}
+	}
 
 	/*
-	 * 32 bit kernels use only 4 of the 512 entries in its top level
-	 * pagetable. We'll use the remainder for the "per CPU" page tables
-	 * for VLP processes.
-	 *
-	 * We also map the top level kernel pagetable into the kernel to make
-	 * it easy to use bcopy to initialize new address spaces.
+	 * 32 bit PAE metal kernels use only 4 of the 512 entries in the
+	 * page holding the top level pagetable. We use the remainder for
+	 * the "per CPU" page tables for VLP processes.
+	 * Map the top level kernel pagetable into the kernel to make
+	 * it easy to use bcopy to access these tables.
 	 */
 	if (mmu.pae_hat) {
 		vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
 		hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE,
 		    kas.a_hat->hat_htable->ht_pfn,
+#if !defined(__xpv)
 		    PROT_WRITE |
+#endif
 		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
 		    HAT_LOAD | HAT_LOAD_NOCONSIST);
 	}
@@ -865,11 +905,14 @@ reload_pae32(hat_t *hat, cpu_t *cpu)
 /*
  * Switch to a new active hat, maintaining bit masks to track active CPUs.
+ *
+ * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it
+ * remains a 32-bit value.
  */
 void
 hat_switch(hat_t *hat)
 {
-	uintptr_t	newcr3;
+	uint64_t	newcr3;
 	cpu_t		*cpu = CPU;
 	hat_t		*old = cpu->cpu_current_hat;
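Why newcr3 had to become uint64_t: with PAE a pagetable can sit above 4G, so a pfn shifted up by the page shift no longer fits a 32-bit uintptr_t. The sketch below assumes MAKECR3() is essentially a plain pfn << MMU_PAGESHIFT; the real macro may fold in additional flag bits, and on Xen the result still goes through pa_to_ma().

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define MMU_PAGESHIFT   12

static uint64_t
make_cr3(uint64_t pfn)          /* illustrative stand-in for MAKECR3() */
{
        return (pfn << MMU_PAGESHIFT);
}

int
main(void)
{
        uint64_t pfn = 0x123456;        /* a pagetable page above 4G */

        printf("cr3 = %#" PRIx64 "\n", make_cr3(pfn)); /* 0x123456000 */
        return (0);
}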
@@ -906,9 +949,37 @@ hat_switch(hat_t *hat)
 		    (cpu->cpu_id + 1) * VLP_SIZE;
 #endif
 	} else {
-		newcr3 = MAKECR3(hat->hat_htable->ht_pfn);
+		newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
 	}
+#ifdef __xpv
+	{
+		struct mmuext_op t[2];
+		uint_t retcnt;
+		uint_t opcnt = 1;
+
+		t[0].cmd = MMUEXT_NEW_BASEPTR;
+		t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
+#if defined(__amd64)
+		/*
+		 * There's an interesting problem here, as to what to
+		 * actually specify when switching to the kernel hat.
+		 * For now we'll reuse the kernel hat again.
+		 */
+		t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
+		if (hat == kas.a_hat)
+			t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
+		else
+			t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
+		++opcnt;
+#endif	/* __amd64 */
+		if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
+			panic("HYPERVISOR_mmu_update() failed");
+		ASSERT(retcnt == opcnt);
+
+	}
+#else
 	setcr3(newcr3);
+#endif
 	ASSERT(cpu == CPU);
 }
@@ -1003,6 +1074,7 @@ hat_swapout(hat_t *hat)
 	htable_t	*ht = NULL;
 	level_t		l;
 
+	XPV_DISALLOW_MIGRATE();
 	/*
 	 * We can't just call hat_unload(hat, 0, _userlimit...) here, because
 	 * seg_spt and shared pagetables can't be swapped out.
@@ -1061,6 +1133,7 @@ hat_swapout(hat_t *hat)
 	 * go back and flush all the htables off the cached list.
 	 */
 	htable_purge_hat(hat);
+	XPV_ALLOW_MIGRATE();
 }
@@ -1138,11 +1211,11 @@ hati_sync_pte_to_page(page_t *pp, x86pte_t pte, level_t level)
 /*
  * This is the set of PTE bits for PFN, permissions and caching
- * that require a TLB flush (hat_tlb_inval) if changed on a HAT_LOAD_REMAP
+ * that are allowed to change on a HAT_LOAD_REMAP
  */
 #define	PT_REMAP_BITS						\
 	(PT_PADDR | PT_NX | PT_WRITABLE | PT_WRITETHRU |	\
-	PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE)
+	PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE | PT_IGNORE | PT_REF | PT_MOD)
 
 #define	REMAPASSERT(EX)	if (!(EX)) panic("hati_pte_map: " #EX)
 /*
@@ -1239,11 +1312,11 @@ hati_pte_map(
 	}
 
 	/*
-	 * We only let remaps change the bits for PFNs, permissions
-	 * or caching type.
+	 * We only let remaps change certain bits in the PTE.
 	 */
-	ASSERT(PTE_GET(old_pte, ~(PT_REMAP_BITS | PT_REF | PT_MOD)) ==
-	    PTE_GET(pte, ~PT_REMAP_BITS));
+	if (PTE_GET(old_pte, ~PT_REMAP_BITS) != PTE_GET(pte, ~PT_REMAP_BITS))
+		panic("remap bits changed: old_pte="FMT_PTE", pte="FMT_PTE"\n",
+		    old_pte, pte);
 
 	/*
 	 * We don't create any mapping list entries on a remap, so release
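The remap sanity check above went from an ASSERT to an unconditional panic; the predicate itself is just "every bit outside PT_REMAP_BITS must be unchanged". A compilable model, where the mask value and the sample PTEs are placeholders, not the real pte bit definitions:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t x86pte_t;

#define PT_REMAP_BITS   0x000ffffffffff09aULL   /* placeholder mask */
#define PTE_GET(p, m)   ((p) & (m))

static int
remap_ok(x86pte_t old_pte, x86pte_t new_pte)
{
        return (PTE_GET(old_pte, ~PT_REMAP_BITS) ==
            PTE_GET(new_pte, ~PT_REMAP_BITS));
}

int
main(void)
{
        x86pte_t old_pte = 0x8000000012345063ULL;
        x86pte_t new_pte = 0x8000000054321063ULL;       /* only PFN changed */

        printf("remap %s\n",
            remap_ok(old_pte, new_pte) ? "allowed" : "would panic");
        return (0);
}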
@@ -1429,6 +1502,7 @@ hat_memload(
 	level_t		level = 0;
 	pfn_t		pfn = page_pptonum(pp);
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || va < _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1444,6 +1518,7 @@ hat_memload(
 	if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
 		ASSERT(hat == kas.a_hat);
 		hat_kmap_load(addr, pp, attr, flags);
+		XPV_ALLOW_MIGRATE();
 		return;
 	}
 
@@ -1454,6 +1529,7 @@ hat_memload(
 		attr |= HAT_STORECACHING_OK;
 	if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0)
 		panic("unexpected hati_load_common() failure");
+	XPV_ALLOW_MIGRATE();
 }
 
 /* ARGSUSED */
@@ -1484,6 +1560,7 @@ hat_memload_array(
 	pfn_t		pfn;
 	pgcnt_t		i;
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || va + len <= _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1555,6 +1632,7 @@ hat_memload_array(
 		va += pgsize;
 		pgindx += mmu_btop(pgsize);
 	}
+	XPV_ALLOW_MIGRATE();
 }
 
 /* ARGSUSED */
@@ -1613,6 +1691,7 @@ hat_devload(
 	int		f;	/* per PTE copy of flags  - maybe modified */
 	uint_t		a;	/* per PTE copy of attr */
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || eva <= _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1645,18 +1724,15 @@ hat_devload(
 		 */
 		a = attr;
 		f = flags;
-		if (pf_is_memory(pfn)) {
-			if (!(a & HAT_PLAT_NOCACHE))
-				a |= HAT_STORECACHING_OK;
+		if (!pf_is_memory(pfn))
+			f |= HAT_LOAD_NOCONSIST;
+		else if (!(a & HAT_PLAT_NOCACHE))
+			a |= HAT_STORECACHING_OK;
 
-			if (f & HAT_LOAD_NOCONSIST)
-				pp = NULL;
-			else
-				pp = page_numtopp_nolock(pfn);
-		} else {
+		if (f & HAT_LOAD_NOCONSIST)
 			pp = NULL;
-			f |= HAT_LOAD_NOCONSIST;
-		}
+		else
+			pp = page_numtopp_nolock(pfn);
 
 		/*
 		 * load this page mapping
@@ -1675,6 +1751,7 @@ hat_devload(
 		va += pgsize;
 		pfn += mmu_btop(pgsize);
 	}
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -1701,6 +1778,7 @@ hat_unlock(hat_t *hat, caddr_t addr, size_t len)
 	if (eaddr > _userlimit)
 		panic("hat_unlock() address out of range - above _userlimit");
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
 	while (vaddr < eaddr) {
 		(void) htable_walk(hat, &ht, &vaddr, eaddr);
@@ -1718,6 +1796,7 @@ hat_unlock(hat_t *hat, caddr_t addr, size_t len)
 	}
 	if (ht)
 		htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 }
 
 /* ARGSUSED */
@@ -1728,6 +1807,7 @@ hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
 	panic("No shared region support on x86");
 }
 
+#if !defined(__xpv)
 /*
 * Cross call service routine to demap a virtual page on
 * the current CPU or flush all mappings in TLB.
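The hat_devload() rewrite above flattens the old nested branches into two decisions: a non-memory pfn forces HAT_LOAD_NOCONSIST, and any NOCONSIST load gets no page_t. A compact model with stand-in flag values and a stubbed page lookup:

#include <stddef.h>
#include <stdio.h>

#define HAT_LOAD_NOCONSIST      0x01
#define HAT_PLAT_NOCACHE        0x02
#define HAT_STORECACHING_OK     0x04

struct page;
static struct page *page_numtopp_nolock(unsigned long pfn) { return (NULL); }
static int pf_is_memory(unsigned long pfn) { return (pfn < 0x100000); }

static struct page *
devload_page(unsigned long pfn, unsigned *a, unsigned *f)
{
        if (!pf_is_memory(pfn))
                *f |= HAT_LOAD_NOCONSIST;
        else if (!(*a & HAT_PLAT_NOCACHE))
                *a |= HAT_STORECACHING_OK;

        return ((*f & HAT_LOAD_NOCONSIST) ? NULL : page_numtopp_nolock(pfn));
}

int
main(void)
{
        unsigned a = 0, f = 0;

        (void) devload_page(0x200000, &a, &f);  /* device pfn, not memory */
        printf("flags now %s NOCONSIST\n",
            (f & HAT_LOAD_NOCONSIST) ? "include" : "lack");
        return (0);
}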
@@ -1851,6 +1931,7 @@ tlb_service(void)
 	if (flags & PS_IE)
 		sti();
 }
+#endif /* !__xpv */
 
 /*
  * Internal routine to do cross calls to invalidate a range of pages on
@@ -1861,10 +1942,12 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 {
 	extern int	flushes_require_xcalls;	/* from mp_startup.c */
 	cpuset_t	justme;
-	cpuset_t	check_cpus;
 	cpuset_t	cpus_to_shootdown;
+#ifndef __xpv
+	cpuset_t	check_cpus;
 	cpu_t		*cpup;
 	int		c;
+#endif
 
 	/*
 	 * If the hat is being destroyed, there are no more users, so
@@ -1887,7 +1970,14 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 	 * if not running with multiple CPUs, don't use cross calls
 	 */
 	if (panicstr || !flushes_require_xcalls) {
+#ifdef __xpv
+		if (va == DEMAP_ALL_ADDR)
+			xen_flush_tlb();
+		else
+			xen_flush_va((caddr_t)va);
+#else
 		(void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)va, NULL);
+#endif
 		return;
 	}
 
@@ -1903,6 +1993,7 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 	else
 		cpus_to_shootdown = hat->hat_cpus;
 
+#ifndef __xpv
 	/*
 	 * If any CPUs in the set are idle, just request a delayed flush
 	 * and avoid waking them up.
@@ -1930,17 +2021,32 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 			CPUSET_DEL(cpus_to_shootdown, c);
 		}
 	}
+#endif
 
 	if (CPUSET_ISNULL(cpus_to_shootdown) ||
 	    CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
+#ifdef __xpv
+		if (va == DEMAP_ALL_ADDR)
+			xen_flush_tlb();
+		else
+			xen_flush_va((caddr_t)va);
+#else
 		(void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)va, NULL);
+#endif
 	} else {
 		CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
+#ifdef __xpv
+		if (va == DEMAP_ALL_ADDR)
+			xen_gflush_tlb(cpus_to_shootdown);
+		else
+			xen_gflush_va((caddr_t)va, cpus_to_shootdown);
+#else
 		xc_call((xc_arg_t)hat, (xc_arg_t)va, NULL, X_CALL_HIPRI,
 		    cpus_to_shootdown, hati_demap_func);
+#endif
 	}
 	kpreempt_enable();
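The xpv branches above all share one shape: DEMAP_ALL_ADDR selects a full TLB flush, anything else a single-page flush. A minimal model; the two stubs merely stand in for the xen_flush_tlb()/xen_flush_va() hypercall wrappers, and the model takes void * where the kernel uses caddr_t:

#include <stdint.h>
#include <stdio.h>

#define DEMAP_ALL_ADDR  (~(uintptr_t)0)

static void xen_flush_tlb(void) { puts("full TLB flush"); }
static void xen_flush_va(void *va) { printf("flush va %p\n", va); }

static void
tlb_inval_local(uintptr_t va)
{
        if (va == DEMAP_ALL_ADDR)
                xen_flush_tlb();
        else
                xen_flush_va((void *)va);
}

int
main(void)
{
        tlb_inval_local(0x400000);
        tlb_inval_local(DEMAP_ALL_ADDR);
        return (0);
}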
@@ -1985,6 +2091,10 @@ hat_pte_unmap(
 		if (PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST) {
 			pp = NULL;
 		} else {
+#ifdef __xpv
+			if (pfn == PFN_INVALID)
+				panic("Invalid PFN, but not PT_NOCONSIST");
+#endif
 			pp = page_numtopp_nolock(pfn);
 			if (pp == NULL) {
 				panic("no page_t, not NOCONSIST: old_pte="
@@ -2000,10 +2110,16 @@ hat_pte_unmap(
 		 * hasn't changed, as the mappings are no longer in use by
 		 * any thread, invalidation is unnecessary.
 		 * If not freeing, do a full invalidate.
+		 *
+		 * On the hypervisor we must always remove mappings, as a
+		 * writable mapping left behind could cause a page table
+		 * allocation to fail.
 		 */
+#if !defined(__xpv)
 		if (hat->hat_flags & HAT_FREEING)
 			old_pte = x86pte_get(ht, entry);
 		else
+#endif
 			old_pte = x86pte_inval(ht, entry, old_pte, pte_ptr);
 
 		/*
@@ -2098,6 +2214,7 @@ hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
 {
 	uintptr_t va = (uintptr_t)addr;
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(hat == kas.a_hat || va + len <= _userlimit);
 
 	/*
@@ -2109,6 +2226,7 @@ hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
 	} else {
 		hat_unload_callback(hat, addr, len, flags, NULL);
 	}
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -2164,6 +2282,7 @@ hat_unload_callback(
 	uint_t		r_cnt = 0;
 	x86pte_t	old_pte;
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
 	ASSERT(IS_PAGEALIGNED(vaddr));
 	ASSERT(IS_PAGEALIGNED(eaddr));
@@ -2179,6 +2298,7 @@ hat_unload_callback(
 			hat_pte_unmap(ht, entry, flags, old_pte, NULL);
 			htable_release(ht);
 		}
+		XPV_ALLOW_MIGRATE();
 		return;
 	}
 
@@ -2225,6 +2345,7 @@ hat_unload_callback(
 	 */
 	if (r_cnt > 0)
 		handle_ranges(cb, r_cnt, r);
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -2251,6 +2372,7 @@ hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
 	ASSERT(IS_PAGEALIGNED(eaddr));
 	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
 
+	XPV_DISALLOW_MIGRATE();
 	for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) {
 try_again:
 		pte = htable_walk(hat, &ht, &vaddr, eaddr);
@@ -2304,6 +2426,7 @@ try_again:
 	}
 	if (ht)
 		htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -2373,6 +2496,7 @@ hat_updateattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr, int what)
 	x86pte_t	oldpte, newpte;
 	page_t		*pp;
 
+	XPV_DISALLOW_MIGRATE();
 	ASSERT(IS_PAGEALIGNED(vaddr));
 	ASSERT(IS_PAGEALIGNED(eaddr));
 	ASSERT(hat == kas.a_hat ||
@@ -2460,6 +2584,7 @@ try_again:
 	}
 	if (ht)
 		htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -2537,6 +2662,7 @@ hat_getpfnum(hat_t *hat, caddr_t addr)
 	if (IN_VA_HOLE(vaddr))
 		return (PFN_INVALID);
 
+	XPV_DISALLOW_MIGRATE();
 	/*
 	 * A very common use of hat_getpfnum() is from the DDI for kernel pages.
 	 * Use the kmap_ptes (which also covers the 32 bit heap) to speed
@@ -2548,21 +2674,25 @@ hat_getpfnum(hat_t *hat, caddr_t addr)
 		pg_index = mmu_btop(vaddr - mmu.kmap_addr);
 		pte = GET_PTE(PT_INDEX_PTR(mmu.kmap_ptes, pg_index));
-		if (!PTE_ISVALID(pte))
-			return (PFN_INVALID);
-		/*LINTED [use of constant 0 causes a silly lint warning] */
-		return (PTE2PFN(pte, 0));
+		if (PTE_ISVALID(pte))
+			/*LINTED [use of constant 0 causes a lint warning] */
+			pfn = PTE2PFN(pte, 0);
+		XPV_ALLOW_MIGRATE();
+		return (pfn);
 	}
 
 	ht = htable_getpage(hat, vaddr, &entry);
-	if (ht == NULL)
+	if (ht == NULL) {
+		XPV_ALLOW_MIGRATE();
 		return (PFN_INVALID);
+	}
 	ASSERT(vaddr >= ht->ht_vaddr);
 	ASSERT(vaddr <= HTABLE_LAST_PAGE(ht));
 	pfn = PTE2PFN(x86pte_get(ht, entry), ht->ht_level);
 	if (ht->ht_level > 0)
 		pfn += mmu_btop(vaddr & LEVEL_OFFSET(ht->ht_level));
 	htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 	return (pfn);
 }
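The large-page adjustment at the end of hat_getpfnum() deserves a worked example: a PTE at level > 0 yields the pfn of the large page's base, so the page offset of vaddr within that mapping level is added back. The constants are the usual x86 shifts; the sample values are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define MMU_PAGESHIFT   12
#define LEVEL_SHIFT(l)  (MMU_PAGESHIFT + 9 * (l))
#define LEVEL_OFFSET(l) (((uint64_t)1 << LEVEL_SHIFT(l)) - 1)
#define mmu_btop(x)     ((x) >> MMU_PAGESHIFT)

int
main(void)
{
        uint64_t vaddr = 0x40123000;    /* inside a 2M (level 1) mapping */
        uint64_t pfn = 0x80000;         /* base pfn from the level-1 PTE */
        int level = 1;

        if (level > 0)
                pfn += mmu_btop(vaddr & LEVEL_OFFSET(level));
        printf("pfn = %#llx\n", (unsigned long long)pfn);       /* 0x80123 */
        return (0);
}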
@@ -2590,7 +2720,7 @@ hat_getkpfnum(caddr_t addr)
 
 	if ((uintptr_t)addr < kernelbase)
 		return (PFN_INVALID);
-
+	XPV_DISALLOW_MIGRATE();
 	if (segkpm && IS_KPM_ADDR(addr)) {
 		badcaller = 1;
 		pfn = hat_kpm_va2pfn(addr);
@@ -2601,6 +2731,7 @@ hat_getkpfnum(caddr_t addr)
 
 	if (badcaller)
 		hat_getkpfnum_badcall(caller());
+	XPV_ALLOW_MIGRATE();
 	return (pfn);
 }
 #endif /* __amd64 */
@@ -2638,10 +2769,8 @@ hat_probe(hat_t *hat, caddr_t addr)
 	}
 
 	ht = htable_getpage(hat, vaddr, &entry);
-	if (ht == NULL)
-		return (0);
 	htable_release(ht);
-	return (1);
+	return (ht != NULL);
 }
 
 /*
@@ -2708,6 +2837,7 @@ hat_share(
 		ASSERT(hat_get_mapped_size(ism_hat) == 0);
 		return (0);
 	}
+	XPV_DISALLOW_MIGRATE();
 
 	/*
 	 * The SPT segment driver often passes us a size larger than there are
@@ -2857,6 +2987,7 @@ not_shared:
 	}
 	if (ism_ht != NULL)
 		htable_release(ism_ht);
+	XPV_ALLOW_MIGRATE();
 	return (0);
 }
 
@@ -2881,6 +3012,7 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
 	ASSERT(eaddr <= _userlimit);
 	ASSERT(IS_PAGEALIGNED(vaddr));
 	ASSERT(IS_PAGEALIGNED(eaddr));
+	XPV_DISALLOW_MIGRATE();
 
 	/*
 	 * First go through and remove any shared pagetables.
@@ -2930,6 +3062,7 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
 	if (!is_it_dism(hat, addr))
 		flags |= HAT_UNLOAD_UNLOCK;
 	hat_unload(hat, addr, len, flags);
+	XPV_ALLOW_MIGRATE();
 }
 
@@ -2957,6 +3090,7 @@ hati_page_clrwrt(struct page *pp)
 	x86pte_t	new;
 	uint_t		pszc = 0;
 
+	XPV_DISALLOW_MIGRATE();
 next_size:
 	/*
 	 * walk thru the mapping list clearing write permission
@@ -2998,6 +3132,7 @@ next_size:
 			goto next_size;
 		}
 	}
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -3161,6 +3296,7 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
 	uint_t		entry;
 	level_t		level;
 
+	XPV_DISALLOW_MIGRATE();
 #if defined(__amd64)
 	/*
 	 * clear the vpm ref.
@@ -3188,6 +3324,7 @@ next_size:
 		 * If not part of a larger page, we're done.
 		 */
 		if (cur_pp->p_szc <= pg_szcd) {
+			XPV_ALLOW_MIGRATE();
 			return (0);
 		}
@@ -3447,6 +3584,7 @@ hat_pagesync(struct page *pp, uint_t flags)
 		}
 	}
 
+	XPV_DISALLOW_MIGRATE();
next_size:
 	/*
 	 * walk thru the mapping list syncing (and clearing) ref/mod bits.
@@ -3506,6 +3644,7 @@ try_again:
 		}
 	}
 done:
+	XPV_ALLOW_MIGRATE();
 	return (save_pp->p_nrm & nrmbits);
 }
@@ -3587,7 +3726,9 @@ void
 hat_thread_exit(kthread_t *thd)
 {
 	ASSERT(thd->t_procp->p_as == &kas);
+	XPV_DISALLOW_MIGRATE();
 	hat_switch(thd->t_procp->p_as->a_hat);
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -3597,11 +3738,13 @@ hat_thread_exit(kthread_t *thd)
 void
 hat_setup(hat_t *hat, int flags)
 {
+	XPV_DISALLOW_MIGRATE();
 	kpreempt_disable();
 
 	hat_switch(hat);
 
 	kpreempt_enable();
+	XPV_ALLOW_MIGRATE();
 }
 
 /*
@@ -3664,6 +3807,11 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
 	/*
 	 * invalidate any left over mapping and decrement the htable valid count
 	 */
+#ifdef __xpv
+	if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0,
+	    UVMF_INVLPG | UVMF_LOCAL))
+		panic("HYPERVISOR_update_va_mapping() failed");
+#else
 	{
 		x86pte_t *pteptr;
 
@@ -3676,6 +3824,7 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
 		mmu_tlbflush_entry(addr);
 		x86pte_mapout();
 	}
+#endif
 
 	ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0);
 	if (ht == NULL)
@@ -3717,7 +3866,12 @@ hat_mempte_remap(
 	ASSERT(ht->ht_pfn == mmu_btop(pte_pa));
 	htable_release(ht);
 #endif
+	XPV_DISALLOW_MIGRATE();
 	pte = hati_mkpte(pfn, attr, 0, flags);
+#ifdef __xpv
+	if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL))
+		panic("HYPERVISOR_update_va_mapping() failed");
+#else
 	{
 		x86pte_t *pteptr;
 
@@ -3730,6 +3884,8 @@ hat_mempte_remap(
 		mmu_tlbflush_entry(addr);
 		x86pte_mapout();
 	}
+#endif
+	XPV_ALLOW_MIGRATE();
 }
 
@@ -4052,3 +4208,30 @@ hat_kpm_mseghash_clear(int nentries)
 void
 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp)
 {}
+
+#ifdef __xpv
+/*
+ * There are specific Hypervisor calls to establish and remove mappings
+ * to grant table references and the privcmd driver. We have to ensure
+ * that a page table actually exists.
+ */
+void
+hat_prepare_mapping(hat_t *hat, caddr_t addr)
+{
+	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
+	(void) htable_create(hat, (uintptr_t)addr, 0, NULL);
+}
+
+void
+hat_release_mapping(hat_t *hat, caddr_t addr)
+{
+	htable_t *ht;
+
+	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
+	ht = htable_lookup(hat, (uintptr_t)addr, 0);
+	ASSERT(ht != NULL);
+	ASSERT(ht->ht_busy >= 2);
+	htable_release(ht);
+	htable_release(ht);
+}
+#endif
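Why hat_release_mapping() calls htable_release() twice: htable_create() in hat_prepare_mapping() leaves a busy hold on the pagetable so it cannot vanish, and htable_lookup() takes a second hold, hence the ht_busy >= 2 assertion before both are dropped. A toy refcount model of that pairing; the counter and stub functions are illustrative, not the real htable implementation.

#include <assert.h>
#include <stdio.h>

static int ht_busy;

static void htable_create(void)  { ++ht_busy; }  /* hold from create */
static void htable_lookup(void)  { ++ht_busy; }  /* hold from lookup */
static void htable_release(void) { --ht_busy; }

int
main(void)
{
        htable_create();                /* hat_prepare_mapping() */

        htable_lookup();                /* hat_release_mapping() ... */
        assert(ht_busy >= 2);
        htable_release();               /* drop the lookup hold */
        htable_release();               /* drop the create-time hold too */
        printf("ht_busy = %d\n", ht_busy);      /* 0 */
        return (0);
}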