/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#ifndef	_VM_DEP_H
#define	_VM_DEP_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#include <sys/clock.h>
#include <vm/hat_pte.h>

/*
 * WARNING: vm_dep.h is included by files in common. As such, macros
 * dependent upon PTE36 such as LARGEPAGESIZE cannot be used in this file.
 */

#define	GETTICK()	tsc_read()

/* memranges in descending order */
extern pfn_t		*memranges;

#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])
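
/*
 * Illustrative values (these are assumptions, not defined in this file):
 * with
 *
 *	memranges[] = { 0x100000, 0x80000, 0x1000, 0x0 }
 *
 * in descending order of starting pfn, mtype 1 covers pfns
 * [0x80000, 0xfffff] since MEMRANGELO(1) == memranges[1] == 0x80000 and
 * MEMRANGEHI(1) == memranges[0] - 1 == 0xfffff. mtype 0, the highest
 * range, is capped by physmax instead.
 */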

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
#ifdef DEBUG
	/* maintain page list stats */
	pgcnt_t	mnr_mt_pgmax;		/* mnode/mtype max page cnt */
	pgcnt_t	mnr_mt_pgcnt;		/* free cnt */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list free cnt */
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t *mnr_mtsc_pgcnt;
	} 	*mnr_mts;
#endif
} mnoderange_t;
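
/*
 * Illustrative split (pfn values are hypothetical): a single mnode
 * spanning pfns 0 .. 0x17ffff would be carved at the memranges[]
 * boundaries into four mnoderange_t entries,
 *
 *	{ mnr_pfnlo = 0x0,      mnr_pfnhi = 0xfff }
 *	{ mnr_pfnlo = 0x1000,   mnr_pfnhi = 0x7ffff }
 *	{ mnr_pfnlo = 0x80000,  mnr_pfnhi = 0xfffff }
 *	{ mnr_pfnlo = 0x100000, mnr_pfnhi = 0x17ffff }
 *
 * each carrying the same mnr_mnode but a different mnr_memrange index.
 */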

#ifdef DEBUG
#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc, colors;						\
	ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) *		\
	    mmu_page_sizes;						\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;	\
	}								\
}

#define	PLCNT_INIT(addr) {						\
	int	mt, szc, colors;					\
	for (mt = 0; mt < mnoderangecnt; mt++) {			\
		mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;	\
		addr += (sizeof (struct mnr_mts) * mmu_page_sizes);	\
		for (szc = 0; szc < mmu_page_sizes; szc++) {		\
			colors = page_get_pagecolors(szc);		\
			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors =	\
			    colors;					\
			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =	\
			    (pgcnt_t *)addr;				\
			addr += (sizeof (pgcnt_t) * colors);		\
		}							\
	}								\
}
#define	PLCNT_DO(pp, mtype, szc, cnt, flags) {				\
	int	bin = PP_2_BIN(pp);					\
	if (flags & PG_LIST_ISINIT)					\
		mnoderanges[mtype].mnr_mt_pgmax += cnt;			\
	atomic_add_long(&mnoderanges[mtype].mnr_mt_pgcnt, cnt);		\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt,	\
		    cnt);						\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mts_pgcnt, cnt);					\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mtsc_pgcnt[bin], cnt);					\
}
#else
#define	PLCNT_SZ(ctrs_sz)
#define	PLCNT_INIT(base)
#define	PLCNT_DO(pp, mtype, szc, cnt, flags)
#endif

#define	PLCNT_INCR(pp, mnode, szc, flags) {				\
	long	cnt = (1 << PAGE_BSZS_SHIFT(szc));			\
	int	mtype = PP_2_MTYPE(pp);					\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
	if (physmax4g && mtype <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	if (flags & PG_LIST_ISINIT) {					\
		if (physmax4g && mtype <= mtype4g)			\
			maxmem4g += cnt;				\
	}								\
	PLCNT_DO(pp, mtype, szc, cnt, flags);				\
}

#define	PLCNT_DECR(pp, mnode, szc, flags) {				\
	long	cnt = ((-1) << PAGE_BSZS_SHIFT(szc));			\
	int	mtype = PP_2_MTYPE(pp);					\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
	if (physmax4g && mtype <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	PLCNT_DO(pp, mtype, szc, cnt, flags);				\
}
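
/*
 * Usage sketch (caller shape is an assumption; the real callers are the
 * common page list routines): these are invoked as pages move on and
 * off the free/cache lists, e.g.
 *
 *	PLCNT_INCR(pp, mnode, pp->p_szc, flags);	pp added to a list
 *	PLCNT_DECR(pp, mnode, pp->p_szc, flags);	pp removed again
 *
 * cnt is scaled to base pages (1 << PAGE_BSZS_SHIFT(szc)), so the
 * counters always track MMU_PAGESIZE pages.
 */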

extern mnoderange_t	*mnoderanges;
extern int		mnoderangecnt;
extern int		mtype4g;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * the BIOS may reserve large chunks of space below 4g for hot plug pci
 * devices, agp aperture, etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

extern int		physmax4g;
extern pgcnt_t		maxmem4g;
extern pgcnt_t		freemem4g;
extern int		lotsfree4gshift;
extern int		desfree4gshift;
#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
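
/*
 * Worked example (numbers are hypothetical): with maxmem4g == 0x80000
 * pages (2g worth of 4k pages below 4g) and desfree4gshift == 4,
 * DESFREE4G == 0x80000 >> 4 == 0x8000 pages, roughly 6% of maxmem4g.
 * RESTRICT4G_ALLOC then becomes true once freemem4g drops below 0x8000
 * pages while less than half of all free memory lies below 4g.
 */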

extern int		restricted_kmemalloc;
extern int		memrange_num(pfn_t);
extern int		pfn_2_mtype(pfn_t);
extern int		mtype_func(int, int, uint_t);

#define	NUM_MEM_RANGES	4		/* memory range types */

/*
 * Per page size free lists. Allocated dynamically.
 * dimensions [mtype][mmu_page_sizes][colors]
 *
 * mtype specifies a physical memory range with a unique mnode.
 */

extern page_t ****page_freelists;

#define	PAGE_FREELISTS(mnode, szc, color, mtype)		\
	(*(page_freelists[mtype][szc] + (color)))

/*
 * For now there is only a single size cache list. Allocated dynamically.
 * dimensions [mtype][colors]
 *
 * mtype specifies a physical memory range with a unique mnode.
 */
extern page_t ***page_cachelists;

#define	PAGE_CACHELISTS(mnode, color, mtype) 		\
	(*(page_cachelists[mtype] + (color)))
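
/*
 * Lookup sketch (caller shape is an assumption; locking and list
 * manipulation are omitted): the head of the szc-0 free list for a
 * given color is read as
 *
 *	page_t *pp = PAGE_FREELISTS(mnode, 0, color, mtype);
 *
 * and similarly for PAGE_CACHELISTS(mnode, color, mtype). Note that the
 * mnode argument is unused here since, on x86, an mtype already
 * identifies a unique mnode.
 */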

/*
 * There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 */

#define	NPC_MUTEX	16

extern kmutex_t	*fpc_mutex[NPC_MUTEX];
extern kmutex_t	*cpc_mutex[NPC_MUTEX];

extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)						\
	(((pp->p_pagenum) & page_colors_mask) >>			\
	(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))

#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, pp->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE(pp->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype(pp->p_pagenum))
#define	PP_2_SZC(pp)		(pp->p_szc)

#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
#define	PFN_BASE(pfnum, szc)	(pfnum & ~(SZCPAGES(szc) - 1))
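
/*
 * Worked example (pfn is hypothetical; 2m large pages assumed, so
 * PAGE_BSZS_SHIFT(1) == 9): SZCPAGES(1) == 512 and
 * PFN_BASE(0x1234, 1) == 0x1234 & ~0x1ff == 0x1200, the first pfn of
 * the 512-page group containing pfn 0x1234.
 */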

#if defined(__amd64)

/*
 * set the mtype range (called from page_get_{free,cache}list)
 *   - set the range to above 4g if the system has more than 4g of memory and
 *   the amount of memory below 4g runs low; otherwise set the range to all of
 *   memory, starting from the high pfns.
 *
 * page_get_anylist gets its mtype range from the specified ddi_dma_attr_t.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	mtype = mnoderangecnt - 1;					\
	if (RESTRICT4G_ALLOC) {						\
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);			\
		/* here only for > 4g systems */			\
		flags |= PGI_MT_RANGE4G;				\
	} else {							\
		flags |= PGI_MT_RANGE0;					\
	}								\
}

#elif defined(__i386)

/*
 * set the mtype range
 *   - kmem requests need to be below 4g if restricted_kmemalloc is set.
 *   - for non-kmem requests, set the range to above 4g if the amount of
 *   memory below 4g runs low.
 */

#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	if (restricted_kmemalloc && (vp) == &kvp &&			\
	    (caddr_t)(vaddr) >= kernelheap &&				\
	    (caddr_t)(vaddr) < ekernelheap) {				\
		ASSERT(physmax4g);					\
		mtype = mtype4g;					\
		flags |= PGI_MT_RANGE0;					\
	} else {							\
		mtype = mnoderangecnt - 1;				\
		if (RESTRICT4G_ALLOC) {					\
			VM_STAT_ADD(vmm_vmstats.restrict4gcnt);		\
			/* here only for > 4g systems */		\
			flags |= PGI_MT_RANGE4G;			\
		} else {						\
			flags |= PGI_MT_RANGE0;				\
		}							\
	}								\
}

#endif	/* __i386 */

/*
 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
 * and page_get_contig_pages)
 *
 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
 * not contain mnode.
 *
 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
 * mtype in the range.
 */

#define	MTYPE_START(mnode, mtype, flags)				\
	(mtype = mtype_func(mnode, mtype, flags))

#define	MTYPE_NEXT(mnode, mtype, flags)					\
	(mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT))
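
/*
 * Traversal sketch (loop shape is an assumption; the real loops live in
 * the page_get_mnode_{free,cache,any}list and page_get_contig_pages
 * callers):
 *
 *	MTYPE_START(mnode, mtype, flags);
 *	while (mtype != -1) {
 *		... search the [mnode][mtype] page lists ...
 *		MTYPE_NEXT(mnode, mtype, flags);
 *	}
 */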

/* mtype init for page_get_replacement_page */

#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode) {			\
	mtype = mnoderangecnt - 1;					\
	flags |= PGI_MT_RANGE0;						\
}

#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
	ASSERT(mnoderanges[mtype].mnr_mnode == mnode);			\
	pfnlo = mnoderanges[mtype].mnr_pfnlo;				\
	pfnhi = mnoderanges[mtype].mnr_pfnhi;

#define	PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])
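
/*
 * Example (bin value is arbitrary): with NPC_MUTEX == 16, bin 21 on the
 * free list maps to &fpc_mutex[21 & 15][mnode] == &fpc_mutex[5][mnode],
 * so bins 16 apart share a lock.
 */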

#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[i][mnode])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[i][mnode])

#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif

#define	FULL_REGION_CNT(rg_szc)	\
	(LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT(rg_szc - 1))

/* Return the leader for this mapping size */
#define	PP_GROUPLEADER(pp, szc) \
	(&(pp)[-(int)((pp)->p_pagenum & (SZCPAGES(szc)-1))])

/* Return the root page for this page based on p_szc */
#define	PP_PAGEROOT(pp) ((pp)->p_szc == 0 ? (pp) : \
	PP_GROUPLEADER((pp), (pp)->p_szc))
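
/*
 * Worked example (pfn is hypothetical): in a 512-page (szc 1) group, a
 * page with p_pagenum == 0x1203 has 0x1203 & (SZCPAGES(1) - 1) == 3, so
 * PP_GROUPLEADER(pp, 1) is &pp[-3], the page_t for pfn 0x1200 at the
 * start of the group.
 */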

/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 */

#define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES-1))
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)

/*
 * cpu/mmu-dependent vm variables
 */
extern uint_t mmu_page_sizes;
extern uint_t mmu_exported_page_sizes;

/* For x86, userszc is the same as the kernel's szc */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)

/*
 * for hw_page_map_t, sized to hold the ratio of large page to base
 * pagesize (1024 max)
 */
typedef	short	hpmctr_t;

/*
 * get the setsize of the current cpu - assume homogeneous for x86
 */
extern int	l2cache_sz, l2cache_linesz, l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz
#define	CPUSETSIZE()		\
	(l2cache_assoc ? (l2cache_sz / l2cache_assoc) : MMU_PAGESIZE)
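
/*
 * Example (cache geometry is hypothetical): with l2cache_sz == 512k and
 * l2cache_assoc == 8, CPUSETSIZE() == 64k - the stride at which
 * physical addresses alias to the same cache sets. With no reported
 * associativity it falls back to MMU_PAGESIZE.
 */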

/*
 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count
 * for the number of base pages in this pagesize
 */
#define	PAGE_BSZS_SHIFT(szc) (LEVEL_SHIFT(szc) - MMU_PAGESHIFT)

/*
 * Internal PG_ flags.
 */
#define	PGI_RELOCONLY	0x010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x080000	/* relocate base pagesize page */

/*
 * PGI range flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE0	0x1000000	/* mtype range to 0 */
#define	PGI_MT_RANGE4G	0x2000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x4000000	/* get next mtype */
#define	PGI_MT_RANGE	(PGI_MT_RANGE0 | PGI_MT_RANGE4G)

/*
 * hash the as and addr to get a bin.
 */

#define	AS_2_BIN(as, seg, vp, addr, bin)				\
	bin = ((((uintptr_t)(addr) >> PAGESHIFT) + ((uintptr_t)(as) >> 4)) \
	    & page_colors_mask)
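
/*
 * Example (addresses are hypothetical): for addr == 0x8048000 and an as
 * pointer of 0xd4a5b200, the bin is
 * ((0x8048000 >> PAGESHIFT) + (0xd4a5b200 >> 4)) & page_colors_mask ==
 * (0x8048 + 0xd4a5b20) & page_colors_mask, so consecutive pages of a
 * mapping land in consecutive bins while different address spaces are
 * offset from one another.
 */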

/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
#define	BIN_STEP	19
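
/*
 * Scan sketch (loop shape is an assumption; the real loops are in
 * page_get_freelist() and page_get_cachelist()):
 *
 *	bin = requested color;
 *	for (i = 0; i <= page_colors; i++) {
 *		... try bin ...
 *		bin += (i == 0) ? BIN_STEP : 1;
 *		bin &= page_colors_mask;
 *	}
 */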

#ifdef VM_STATS
struct vmm_vmstats_str {
	ulong_t pc_list_add_pages[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages1[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages2[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages3[MMU_PAGE_SIZES];
	ulong_t pgf_alloc[MMU_PAGE_SIZES];
	ulong_t pgf_allocok[MMU_PAGE_SIZES];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
	ulong_t pgc_alloc;
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t pgc_allocokdeferred;
	ulong_t pgc_allocfailed;
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
	ulong_t	ptcp[MMU_PAGE_SIZES];
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgmc_alloc;
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;
	ulong_t	ppr_reloc[MMU_PAGE_SIZES];
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t page_ctrs_coalesce;	/* page coalesce counter */
	ulong_t page_ctrs_cands_skip;	/* candidates useful */
	ulong_t page_ctrs_changed;	/* ctrs changed after locking */
	ulong_t page_ctrs_failed;	/* page_freelist_coalesce failed */
	ulong_t page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
	ulong_t	restrict4gcnt;
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */

extern size_t page_ctrs_sz(void);
extern caddr_t page_ctrs_alloc(caddr_t);
extern void page_ctr_sub(page_t *, int);
extern page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
extern uint_t page_get_pagecolors(uint_t);

#ifdef	__cplusplus
}
#endif

#endif	/* _VM_DEP_H */