usr/src/uts/i86pc/vm/i86_mmu.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#endif

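/*
 * Does the given CPU currently have a user (non-kernel) HAT loaded?  Used
 * below to decide whether PCID_USER mappings might need to be flushed.
 */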
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)

/*
 * This flag is not set early in boot.  Once it is set, we are no longer
 * using the boot loader's page tables.
 */
uint_t khat_running = 0;

/*
 * This procedure is callable only while the boot loader is in charge of the
 * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 * kboot_mmu.c since it's used from common code.
 */
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t	des_va = ALIGN2PAGE(vaddr);
	uintptr_t	va = des_va;
	size_t		len;
	uint_t		prot;
	pfn_t		pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

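	/*
	 * kbm_probe() finds the first boot-loader mapping at or above va.
	 * That mapping may begin below des_va if des_va falls inside a
	 * large page, in which case we add the offset into that page.
	 */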
	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}

/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance. This always holds segmap's PTEs.
 * In the 32-bit kernel this maps the kernel heap too.
 */
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area in kernel for kmap ptes */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);

	/*
	 * allocate vmem for the kmap_ptes
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen, freed, etc.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * set information in mmu to activate handling of kmap
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}

extern caddr_t	kpm_vbase;
extern size_t	kpm_size;

#ifdef __xpv
/*
 * Create the initial segkpm mappings for the hypervisor. To avoid having
 * to deal with page tables being read-only, we make all mappings
 * read-only at first.
 */
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

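	/*
	 * Walk the region a page at a time.  The kbm_map() of the base
	 * address appears to exist just to populate the pagetables for the
	 * region; kbm_read_only() then enters each 4K mapping read-only.
	 */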
	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}

/*
 * Try to make all kpm mappings writable. Failures are ok, as those
 * are just pagetable, GDT, etc. pages.
 */
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif

/*
 * Routine to pre-allocate data structures for hat_kern_setup(). It computes
 * how many pagetables it needs by walking the boot loader's page tables.
 */
/*ARGSUSED*/
void
hat_kern_alloc(
	caddr_t	segmap_base,
	size_t	segmap_size,
	caddr_t	ekernelheap)
{
	uintptr_t	last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t	va = 0;
	size_t		size;
	pfn_t		pfn;
	uint_t		prot;
	uint_t		table_cnt = 1;
	uint_t		mapping_cnt;
	level_t		start_level;
	level_t		l;
	struct memlist	*pmem;
	level_t		lpagel = mmu.max_page_level;
	uint64_t	paddr;
	int64_t		psize;
	int		nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables.  When running on the
		 * hypervisor these are made read-only at first.
		 * Later we'll add write permission where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				/*
				 * Create read/only mappings to avoid
				 * conflicting with pagetable usage
				 */
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map pagetables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, these windows are also used by the xpv_panic
	 * code, where we need one window for each level of the pagetable
	 * hierarchy.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in
		 * which we map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out
	 * how many tables and page mappings there will be.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * At each level, if the last_va falls into a new htable,
		 * increment table_cnt. We can stop at the 1st level where
		 * they are in the same htable.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
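		/*
		 * Advance va to the next mapping boundary.  For 4K mappings
		 * we skip ahead a full level 1 region, since any further 4K
		 * mappings there live in the same, already counted, table.
		 */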
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in
	 * the entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop.  The unused
	 * slop is freed back when we call htable_adjust_reserve() later.
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages.
	 * This should be far, far fewer than the total possible;
	 * we'll allocate enough for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable/hment reserves
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}


/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * Once this function starts, the boot loader may only add new mappings;
 * in particular, it cannot change the pagesize used for any existing
 * mappings or this code breaks!
 */

void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing pagetables
	 */
	/* BEGIN CSTYLED */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif
	/* END CSTYLED */

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a pagetable.
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

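	/*
	 * Record that this CPU is now running on the kernel HAT.
	 */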
	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}

#ifndef __xpv

/*
 * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case,
 * but INVPCID_ADDR cannot.
 */
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t	flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

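	/*
	 * No usable invpcid instruction: emulate it.  Toggling CR4.PGE
	 * flushes the entire TLB, including global entries, while reloading
	 * %cr3 with PCIDs disabled flushes all non-global entries.
	 */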
	switch (type) {
	case INVPCID_ALL_GLOBAL:
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
		if (pcid == PCID_USER) {
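			/*
			 * tr_mmu_flush_user_range() briefly switches to the
			 * user %cr3 to perform the flush, so it must run
			 * with interrupts disabled.
			 */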
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}

/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the PCIDE
 * case: namely, this flush should never need to flush a non-current PCID
 * mapping.  This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on).  It also relies on
 * PCID_KERNEL == PCID_NONE.
 */
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);
	mmu_invlpg((caddr_t)va);
}

/*
 * Flush one mapping: local CPU version of hat_tlb_inval().
 *
 * If this is a userspace address in the PCIDE case, we need two
 * invalidations: one for any potentially stale PCID_USER mapping, and one
 * for any mapping established while in the kernel.
 */
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

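	/*
	 * Without PCIDs there is only a single TLB context, so one
	 * invlpg suffices.
	 */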
	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	/*
	 * Yes, kas will need to flush below kernelspace, at least during boot.
	 * But there's no PCID_USER context.
	 */
	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}

static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * Since an emulated invpcid() in the PCIDE case requires switching
	 * %cr3 values, we batch the invalidations.  We should only need to
	 * flush the user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();
		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}

/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
 * invalidated.
 */
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE: {
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;
	}

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}

#endif /* ! __xpv */